peg-parser.cpp 64 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712
  1. #include "common.h"
  2. #include "peg-parser.h"
  3. #include "json-schema-to-grammar.h"
  4. #include "unicode.h"
  5. #include <nlohmann/json.hpp>
  6. #include <algorithm>
  7. #include <initializer_list>
  8. #include <map>
  9. #include <memory>
  10. #include <regex>
  11. #include <stdexcept>
  12. #include <unordered_set>
  13. // Trick to catch missing branches
  14. template <typename T>
  15. inline constexpr bool is_always_false_v = false;
  16. const char * common_peg_parse_result_type_name(common_peg_parse_result_type type) {
  17. switch (type) {
  18. case COMMON_PEG_PARSE_RESULT_FAIL: return "fail";
  19. case COMMON_PEG_PARSE_RESULT_SUCCESS: return "success";
  20. case COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT: return "need_more_input";
  21. default: return "unknown";
  22. }
  23. }
  24. static bool is_hex_digit(const char c) {
  25. return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
  26. }
  27. // Trie for matching multiple literals.
  28. // This is used in common_peg_until_parser and to build a GBNF exclusion grammar
  29. struct trie {
  30. struct node {
  31. size_t depth = 0;
  32. std::map<unsigned char, size_t> children;
  33. bool is_word;
  34. };
  35. std::vector<node> nodes;
  36. trie(const std::vector<std::string> & words) {
  37. create_node(); // root node
  38. for (const auto & w : words) {
  39. insert(w);
  40. }
  41. }
  42. enum match_result { NO_MATCH, PARTIAL_MATCH, COMPLETE_MATCH };
  43. // Check if a delimiter starts at the given position
  44. match_result check_at(std::string_view sv, size_t start_pos) const {
  45. size_t current = 0; // Start at root
  46. size_t pos = start_pos;
  47. while (pos < sv.size()) {
  48. auto it = nodes[current].children.find(sv[pos]);
  49. if (it == nodes[current].children.end()) {
  50. // Can't continue matching
  51. return match_result{match_result::NO_MATCH};
  52. }
  53. current = it->second;
  54. pos++;
  55. // Check if we've matched a complete word
  56. if (nodes[current].is_word) {
  57. return match_result{match_result::COMPLETE_MATCH};
  58. }
  59. }
  60. // Reached end of input while still in the trie (not at root)
  61. if (current != 0) {
  62. // We're in the middle of a potential match
  63. return match_result{match_result::PARTIAL_MATCH};
  64. }
  65. // Reached end at root (no match)
  66. return match_result{match_result::NO_MATCH};
  67. }
  68. struct prefix_and_next {
  69. std::string prefix;
  70. std::string next_chars;
  71. };
  72. std::vector<prefix_and_next> collect_prefix_and_next() {
  73. std::string prefix;
  74. std::vector<prefix_and_next> result;
  75. collect_prefix_and_next(0, prefix, result);
  76. return result;
  77. }
  78. private:
  79. void collect_prefix_and_next(size_t index, std::string & prefix, std::vector<prefix_and_next> & out) {
  80. if (!nodes[index].is_word) {
  81. if (!nodes[index].children.empty()) {
  82. std::string chars;
  83. chars.reserve(nodes[index].children.size());
  84. for (const auto & p : nodes[index].children) {
  85. chars.push_back(p.first);
  86. }
  87. out.emplace_back(prefix_and_next{prefix, chars});
  88. }
  89. }
  90. for (const auto & p : nodes[index].children) {
  91. unsigned char ch = p.first;
  92. auto child = p.second;
  93. prefix.push_back(ch);
  94. collect_prefix_and_next(child, prefix, out);
  95. prefix.pop_back();
  96. }
  97. }
  98. size_t create_node() {
  99. size_t index = nodes.size();
  100. nodes.emplace_back();
  101. return index;
  102. }
  103. void insert(const std::string & word) {
  104. size_t current = 0;
  105. for (unsigned char ch : word) {
  106. auto it = nodes[current].children.find(ch);
  107. if (it == nodes[current].children.end()) {
  108. size_t child = create_node();
  109. nodes[child].depth = nodes[current].depth + 1;
  110. nodes[current].children[ch] = child;
  111. current = child;
  112. } else {
  113. current = it->second;
  114. }
  115. }
  116. nodes[current].is_word = true;
  117. }
  118. };
  119. static std::pair<uint32_t, size_t> parse_hex_escape(const std::string & str, size_t pos, int hex_count) {
  120. if (pos + hex_count > str.length()) {
  121. return {0, 0};
  122. }
  123. uint32_t value = 0;
  124. for (int i = 0; i < hex_count; i++) {
  125. char c = str[pos + i];
  126. if (!is_hex_digit(c)) {
  127. return {0, 0};
  128. }
  129. value <<= 4;
  130. if ('a' <= c && c <= 'f') {
  131. value += c - 'a' + 10;
  132. } else if ('A' <= c && c <= 'F') {
  133. value += c - 'A' + 10;
  134. } else if ('0' <= c && c <= '9') {
  135. value += c - '0';
  136. } else {
  137. break;
  138. }
  139. }
  140. return {value, static_cast<size_t>(hex_count)};
  141. }
  142. static std::pair<uint32_t, size_t> parse_char_class_char(const std::string & content, size_t pos) {
  143. if (content[pos] == '\\' && pos + 1 < content.length()) {
  144. switch (content[pos + 1]) {
  145. case 'x': {
  146. auto result = parse_hex_escape(content, pos + 2, 2);
  147. if (result.second > 0) {
  148. return {result.first, 2 + result.second};
  149. }
  150. // Invalid escape, treat as literal 'x'
  151. return {static_cast<uint32_t>('x'), 2};
  152. }
  153. case 'u': {
  154. auto result = parse_hex_escape(content, pos + 2, 4);
  155. if (result.second > 0) {
  156. return {result.first, 2 + result.second};
  157. }
  158. // Invalid escape, treat as literal 'u'
  159. return {static_cast<uint32_t>('u'), 2};
  160. }
  161. case 'U': {
  162. auto result = parse_hex_escape(content, pos + 2, 8);
  163. if (result.second > 0) {
  164. return {result.first, 2 + result.second};
  165. }
  166. // Invalid escape, treat as literal 'U'
  167. return {static_cast<uint32_t>('U'), 2};
  168. }
  169. case 'n': return {'\n', 2};
  170. case 't': return {'\t', 2};
  171. case 'r': return {'\r', 2};
  172. case '\\': return {'\\', 2};
  173. case ']': return {']', 2};
  174. case '[': return {'[', 2};
  175. default: return {static_cast<uint32_t>(content[pos + 1]), 2};
  176. }
  177. }
  178. // Regular character - return as codepoint
  179. return {static_cast<uint32_t>(static_cast<unsigned char>(content[pos])), 1};
  180. }
  181. static std::pair<std::vector<common_peg_chars_parser::char_range>, bool> parse_char_classes(const std::string & classes) {
  182. std::vector<common_peg_chars_parser::char_range> ranges;
  183. bool negated = false;
  184. std::string content = classes;
  185. if (content.front() == '[') {
  186. content = content.substr(1);
  187. }
  188. if (content.back() == ']') {
  189. content.pop_back();
  190. }
  191. // Check for negation
  192. if (!content.empty() && content.front() == '^') {
  193. negated = true;
  194. content = content.substr(1);
  195. }
  196. size_t i = 0;
  197. while (i < content.length()) {
  198. auto [start, start_len] = parse_char_class_char(content, i);
  199. i += start_len;
  200. if (i + 1 < content.length() && content[i] == '-') {
  201. // Range detected
  202. auto [end, end_len] = parse_char_class_char(content, i + 1);
  203. ranges.push_back(common_peg_chars_parser::char_range{start, end});
  204. i += 1 + end_len;
  205. } else {
  206. ranges.push_back(common_peg_chars_parser::char_range{start, start});
  207. }
  208. }
  209. return {ranges, negated};
  210. }
  211. void common_peg_ast_arena::visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const {
  212. if (id == COMMON_PEG_INVALID_AST_ID) {
  213. return;
  214. }
  215. const auto & node = get(id);
  216. visitor(node);
  217. for (const auto & child : node.children) {
  218. visit(child, visitor);
  219. }
  220. }
  221. void common_peg_ast_arena::visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const {
  222. for (const auto & node : result.nodes) {
  223. visit(node, visitor);
  224. }
  225. }
  226. struct parser_executor;
  227. common_peg_parser_id common_peg_arena::add_parser(common_peg_parser_variant parser) {
  228. common_peg_parser_id id = parsers_.size();
  229. parsers_.push_back(std::move(parser));
  230. return id;
  231. }
  232. void common_peg_arena::add_rule(const std::string & name, common_peg_parser_id id) {
  233. rules_[name] = id;
  234. }
  235. common_peg_parser_id common_peg_arena::get_rule(const std::string & name) const {
  236. auto it = rules_.find(name);
  237. if (it == rules_.end()) {
  238. throw std::runtime_error("Rule not found: " + name);
  239. }
  240. return it->second;
  241. }
  242. struct parser_executor {
  243. const common_peg_arena & arena;
  244. common_peg_parse_context & ctx;
  245. size_t start_pos;
  246. parser_executor(const common_peg_arena & arena, common_peg_parse_context & ctx, size_t start)
  247. : arena(arena), ctx(ctx), start_pos(start) {}
  248. common_peg_parse_result operator()(const common_peg_epsilon_parser & /* p */) const {
  249. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
  250. }
  251. common_peg_parse_result operator()(const common_peg_start_parser & /* p */) const {
  252. return common_peg_parse_result(
  253. start_pos == 0 ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL,
  254. start_pos
  255. );
  256. }
  257. common_peg_parse_result operator()(const common_peg_end_parser & /* p */) const {
  258. return common_peg_parse_result(
  259. start_pos >= ctx.input.size() ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL,
  260. start_pos
  261. );
  262. }
  263. common_peg_parse_result operator()(const common_peg_literal_parser & p) {
  264. auto pos = start_pos;
  265. for (auto i = 0u; i < p.literal.size(); ++i) {
  266. if (pos >= ctx.input.size()) {
  267. if (!ctx.is_partial) {
  268. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  269. }
  270. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
  271. }
  272. if (ctx.input[pos] != p.literal[i]) {
  273. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  274. }
  275. ++pos;
  276. }
  277. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
  278. }
  279. common_peg_parse_result operator()(const common_peg_sequence_parser & p) {
  280. auto pos = start_pos;
  281. std::vector<common_peg_ast_id> nodes;
  282. for (const auto & child_id : p.children) {
  283. auto result = arena.parse(child_id, ctx, pos);
  284. if (result.fail()) {
  285. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end);
  286. }
  287. if (!result.nodes.empty()) {
  288. nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
  289. }
  290. if (result.need_more_input()) {
  291. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
  292. }
  293. pos = result.end;
  294. }
  295. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
  296. }
  297. common_peg_parse_result operator()(const common_peg_choice_parser & p) {
  298. auto pos = start_pos;
  299. for (const auto & child_id : p.children) {
  300. auto result = arena.parse(child_id, ctx, pos);
  301. if (!result.fail()) {
  302. return result;
  303. }
  304. }
  305. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  306. }
  307. common_peg_parse_result operator()(const common_peg_repetition_parser & p) {
  308. auto pos = start_pos;
  309. int match_count = 0;
  310. std::vector<common_peg_ast_id> nodes;
  311. // Try to match up to max_count times (or unlimited if max_count is -1)
  312. while (p.max_count == -1 || match_count < p.max_count) {
  313. if (pos >= ctx.input.size()) {
  314. break;
  315. }
  316. auto result = arena.parse(p.child, ctx, pos);
  317. if (result.success()) {
  318. // Prevent infinite loop on empty matches
  319. if (result.end == pos) {
  320. break;
  321. }
  322. if (!result.nodes.empty()) {
  323. nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
  324. }
  325. pos = result.end;
  326. match_count++;
  327. continue;
  328. }
  329. if (result.need_more_input()) {
  330. if (!result.nodes.empty()) {
  331. nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
  332. }
  333. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
  334. }
  335. // Child failed - stop trying
  336. break;
  337. }
  338. // Check if we got enough matches
  339. if (p.min_count > 0 && match_count < p.min_count) {
  340. if (pos >= ctx.input.size() && ctx.is_partial) {
  341. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes));
  342. }
  343. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
  344. }
  345. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
  346. }
  347. common_peg_parse_result operator()(const common_peg_and_parser & p) {
  348. auto result = arena.parse(p.child, ctx, start_pos);
  349. // Pass result but don't consume input
  350. return common_peg_parse_result(result.type, start_pos);
  351. }
  352. common_peg_parse_result operator()(const common_peg_not_parser & p) {
  353. auto result = arena.parse(p.child, ctx, start_pos);
  354. if (result.success()) {
  355. // Fail if the underlying parser matches
  356. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  357. }
  358. if (result.need_more_input()) {
  359. // Propagate - need to know what child would match before negating
  360. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
  361. }
  362. // Child failed, so negation succeeds
  363. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
  364. }
  365. common_peg_parse_result operator()(const common_peg_any_parser & /* p */) const {
  366. // Parse a single UTF-8 codepoint (not just a single byte)
  367. auto result = parse_utf8_codepoint(ctx.input, start_pos);
  368. if (result.status == utf8_parse_result::INCOMPLETE) {
  369. if (!ctx.is_partial) {
  370. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  371. }
  372. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
  373. }
  374. if (result.status == utf8_parse_result::INVALID) {
  375. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  376. }
  377. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, start_pos + result.bytes_consumed);
  378. }
  379. common_peg_parse_result operator()(const common_peg_space_parser & /* p */) {
  380. auto pos = start_pos;
  381. while (pos < ctx.input.size()) {
  382. auto c = static_cast<unsigned char>(ctx.input[pos]);
  383. if (std::isspace(c)) {
  384. ++pos;
  385. } else {
  386. break;
  387. }
  388. }
  389. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
  390. }
  391. common_peg_parse_result operator()(const common_peg_chars_parser & p) const {
  392. auto pos = start_pos;
  393. int match_count = 0;
  394. // Try to match up to max_count times (or unlimited if max_count is -1)
  395. while (p.max_count == -1 || match_count < p.max_count) {
  396. auto result = parse_utf8_codepoint(ctx.input, pos);
  397. if (result.status == utf8_parse_result::INCOMPLETE) {
  398. if (match_count >= p.min_count) {
  399. // We have enough matches, succeed with what we have
  400. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
  401. }
  402. // Not enough matches yet
  403. if (!ctx.is_partial) {
  404. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  405. }
  406. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
  407. }
  408. if (result.status == utf8_parse_result::INVALID) {
  409. // Malformed UTF-8 in input
  410. if (match_count >= p.min_count) {
  411. // We have enough matches, succeed up to here
  412. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
  413. }
  414. // Not enough matches, fail
  415. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  416. }
  417. // Check if this codepoint matches our character class
  418. bool matches = false;
  419. for (const auto & range : p.ranges) {
  420. if (range.contains(result.codepoint)) {
  421. matches = true;
  422. break;
  423. }
  424. }
  425. // If negated, invert the match result
  426. if (p.negated) {
  427. matches = !matches;
  428. }
  429. if (matches) {
  430. pos += result.bytes_consumed;
  431. ++match_count;
  432. } else {
  433. // Character doesn't match, stop matching
  434. break;
  435. }
  436. }
  437. // Check if we got enough matches
  438. if (match_count < p.min_count) {
  439. if (pos >= ctx.input.size() && ctx.is_partial) {
  440. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
  441. }
  442. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
  443. }
  444. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
  445. }
  446. static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) {
  447. ++pos; // consume '\'
  448. if (pos >= ctx.input.size()) {
  449. if (!ctx.is_partial) {
  450. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
  451. }
  452. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
  453. }
  454. switch (ctx.input[pos]) {
  455. case '"':
  456. case '\\':
  457. case '/':
  458. case 'b':
  459. case 'f':
  460. case 'n':
  461. case 'r':
  462. case 't':
  463. ++pos;
  464. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
  465. case 'u':
  466. return handle_unicode_escape(ctx, start, pos);
  467. default:
  468. // Invalid escape sequence
  469. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
  470. }
  471. }
  472. static common_peg_parse_result handle_unicode_escape(common_peg_parse_context & ctx, size_t start, size_t & pos) {
  473. ++pos; // consume 'u'
  474. for (int i = 0; i < 4; ++i) {
  475. if (pos >= ctx.input.size()) {
  476. if (!ctx.is_partial) {
  477. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
  478. }
  479. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
  480. }
  481. if (!is_hex_digit(ctx.input[pos])) {
  482. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
  483. }
  484. ++pos;
  485. }
  486. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
  487. }
  488. common_peg_parse_result operator()(const common_peg_json_string_parser & /* p */) {
  489. auto pos = start_pos;
  490. // Parse string content (without quotes)
  491. while (pos < ctx.input.size()) {
  492. char c = ctx.input[pos];
  493. if (c == '"') {
  494. // Found closing quote - success (don't consume it)
  495. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
  496. }
  497. if (c == '\\') {
  498. auto result = handle_escape_sequence(ctx, start_pos, pos);
  499. if (!result.success()) {
  500. return result;
  501. }
  502. } else {
  503. auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
  504. if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
  505. if (!ctx.is_partial) {
  506. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  507. }
  508. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
  509. }
  510. if (utf8_result.status == utf8_parse_result::INVALID) {
  511. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  512. }
  513. pos += utf8_result.bytes_consumed;
  514. }
  515. }
  516. // Reached end without finding closing quote
  517. if (!ctx.is_partial) {
  518. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
  519. }
  520. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
  521. }
  522. common_peg_parse_result operator()(const common_peg_until_parser & p) const {
  523. trie matcher(p.delimiters);
  524. // Scan input and check for delimiters
  525. size_t pos = start_pos;
  526. size_t last_valid_pos = start_pos;
  527. while (pos < ctx.input.size()) {
  528. auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
  529. if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
  530. // Incomplete UTF-8 sequence
  531. if (!ctx.is_partial) {
  532. // Input is complete but UTF-8 is incomplete = malformed
  533. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  534. }
  535. // Return what we have so far (before incomplete sequence)
  536. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
  537. }
  538. if (utf8_result.status == utf8_parse_result::INVALID) {
  539. // Malformed UTF-8
  540. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
  541. }
  542. // Check if a delimiter starts at this position
  543. auto match = matcher.check_at(ctx.input, pos);
  544. if (match == trie::COMPLETE_MATCH) {
  545. // Found a complete delimiter, return everything before it
  546. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
  547. }
  548. if (match == trie::PARTIAL_MATCH) {
  549. // Found a partial match extending to end of input, return everything before it
  550. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
  551. }
  552. pos += utf8_result.bytes_consumed;
  553. last_valid_pos = pos;
  554. }
  555. if (last_valid_pos == ctx.input.size() && ctx.is_partial) {
  556. // Reached the end of a partial stream, there might still be more input that we need to consume.
  557. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
  558. }
  559. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, last_valid_pos);
  560. }
  561. common_peg_parse_result operator()(const common_peg_schema_parser & p) {
  562. return arena.parse(p.child, ctx, start_pos);
  563. }
  564. common_peg_parse_result operator()(const common_peg_rule_parser & p) {
  565. // Parse the child
  566. auto result = arena.parse(p.child, ctx, start_pos);
  567. if (!result.fail()) {
  568. std::string_view text;
  569. if (result.start < ctx.input.size()) {
  570. text = std::string_view(ctx.input).substr(result.start, result.end - result.start);
  571. }
  572. auto node_id = ctx.ast.add_node(
  573. p.name,
  574. "",
  575. result.start,
  576. result.end,
  577. text,
  578. std::move(result.nodes),
  579. result.need_more_input()
  580. );
  581. return common_peg_parse_result(result.type, result.start, result.end, { node_id });
  582. }
  583. return result;
  584. }
  585. common_peg_parse_result operator()(const common_peg_tag_parser & p) {
  586. // Parse the child
  587. auto result = arena.parse(p.child, ctx, start_pos);
  588. if (!result.fail()) {
  589. std::string_view text;
  590. if (result.start < ctx.input.size()) {
  591. text = std::string_view(ctx.input).substr(result.start, result.end - result.start);
  592. }
  593. auto node_id = ctx.ast.add_node(
  594. "",
  595. p.tag,
  596. result.start,
  597. result.end,
  598. text,
  599. std::move(result.nodes),
  600. result.need_more_input()
  601. );
  602. return common_peg_parse_result(result.type, result.start, result.end, { node_id });
  603. }
  604. return result;
  605. }
  606. common_peg_parse_result operator()(const common_peg_ref_parser & p) {
  607. auto rule_id = arena.get_rule(p.name);
  608. return arena.parse(rule_id, ctx, start_pos);
  609. }
  610. common_peg_parse_result operator()(const common_peg_atomic_parser & p) {
  611. auto result = arena.parse(p.child, ctx, start_pos);
  612. if (result.need_more_input()) {
  613. // Clear nodes so they don't propagate up.
  614. result.nodes.clear();
  615. }
  616. return result;
  617. }
  618. };
  619. common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const {
  620. if (root_ == COMMON_PEG_INVALID_PARSER_ID) {
  621. throw std::runtime_error("No root parser set");
  622. }
  623. return parse(root_, ctx, start);
  624. }
  625. common_peg_parse_result common_peg_arena::parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const {
  626. // Execute parser
  627. const auto & parser = parsers_.at(id);
  628. parser_executor exec(*this, ctx, start);
  629. return std::visit(exec, parser);
  630. }
  631. common_peg_parser_id common_peg_arena::resolve_ref(common_peg_parser_id id) {
  632. const auto & parser = parsers_.at(id);
  633. if (auto ref = std::get_if<common_peg_ref_parser>(&parser)) {
  634. return get_rule(ref->name);
  635. }
  636. return id;
  637. }
  638. void common_peg_arena::resolve_refs() {
  639. // Walk through all parsers and replace refs with their corresponding rule IDs
  640. for (auto & parser : parsers_) {
  641. std::visit([this](auto & p) {
  642. using T = std::decay_t<decltype(p)>;
  643. if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
  644. for (auto & child : p.children) {
  645. child = resolve_ref(child);
  646. }
  647. } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
  648. for (auto & child : p.children) {
  649. child = resolve_ref(child);
  650. }
  651. } else if constexpr (std::is_same_v<T, common_peg_repetition_parser> ||
  652. std::is_same_v<T, common_peg_and_parser> ||
  653. std::is_same_v<T, common_peg_not_parser> ||
  654. std::is_same_v<T, common_peg_tag_parser> ||
  655. std::is_same_v<T, common_peg_atomic_parser>) {
  656. p.child = resolve_ref(p.child);
  657. } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
  658. p.child = resolve_ref(p.child);
  659. } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
  660. p.child = resolve_ref(p.child);
  661. } else if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
  662. std::is_same_v<T, common_peg_start_parser> ||
  663. std::is_same_v<T, common_peg_end_parser> ||
  664. std::is_same_v<T, common_peg_ref_parser> ||
  665. std::is_same_v<T, common_peg_until_parser> ||
  666. std::is_same_v<T, common_peg_literal_parser> ||
  667. std::is_same_v<T, common_peg_json_string_parser> ||
  668. std::is_same_v<T, common_peg_chars_parser> ||
  669. std::is_same_v<T, common_peg_any_parser> ||
  670. std::is_same_v<T, common_peg_space_parser>) {
  671. // These rules do not have children
  672. } else {
  673. static_assert(is_always_false_v<T>);
  674. }
  675. }, parser);
  676. }
  677. // Also flatten root if it's a ref
  678. if (root_ != COMMON_PEG_INVALID_PARSER_ID) {
  679. root_ = resolve_ref(root_);
  680. }
  681. }
  682. std::string common_peg_arena::dump(common_peg_parser_id id) const {
  683. const auto & parser = parsers_.at(id);
  684. return std::visit([this](const auto & p) -> std::string {
  685. using T = std::decay_t<decltype(p)>;
  686. if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
  687. return "Epsilon";
  688. } else if constexpr (std::is_same_v<T, common_peg_start_parser>) {
  689. return "Start";
  690. } else if constexpr (std::is_same_v<T, common_peg_end_parser>) {
  691. return "End";
  692. } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
  693. return "Literal(" + p.literal + ")";
  694. } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
  695. std::vector<std::string> parts;
  696. for (const auto & child : p.children) {
  697. parts.push_back(dump(child));
  698. }
  699. return "Sequence(" + string_join(parts, ", ") + ")";
  700. } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
  701. std::vector<std::string> parts;
  702. for (const auto & child : p.children) {
  703. parts.push_back(dump(child));
  704. }
  705. return "Choice(" + string_join(parts, ", ") + ")";
  706. } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
  707. if (p.max_count == -1) {
  708. return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", unbounded)";
  709. }
  710. return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
  711. } else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
  712. return "And(" + dump(p.child) + ")";
  713. } else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
  714. return "Not(" + dump(p.child) + ")";
  715. } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
  716. return "Any";
  717. } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
  718. return "Space";
  719. } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
  720. if (p.max_count == -1) {
  721. return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)";
  722. }
  723. return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
  724. } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
  725. return "JsonString()";
  726. } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
  727. return "Until(" + string_join(p.delimiters, " | ") + ")";
  728. } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
  729. return "Schema(" + dump(p.child) + ", " + (p.schema ? p.schema->dump() : "null") + ")";
  730. } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
  731. return "Rule(" + p.name + ", " + dump(p.child) + ")";
  732. } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
  733. return "Ref(" + p.name + ")";
  734. } else {
  735. return "Unknown";
  736. }
  737. }, parser);
  738. }
  739. common_peg_parser & common_peg_parser::operator=(const common_peg_parser & other) {
  740. id_ = other.id_;
  741. return *this;
  742. }
  743. common_peg_parser & common_peg_parser::operator+=(const common_peg_parser & other) {
  744. id_ = builder_.sequence({id_, other.id_});
  745. return *this;
  746. }
  747. common_peg_parser & common_peg_parser::operator|=(const common_peg_parser & other) {
  748. id_ = builder_.choice({id_, other.id_});
  749. return *this;
  750. }
  751. common_peg_parser common_peg_parser::operator+(const common_peg_parser & other) const {
  752. return builder_.sequence({id_, other.id_});
  753. }
  754. common_peg_parser common_peg_parser::operator|(const common_peg_parser & other) const {
  755. return builder_.choice({id_, other.id_});
  756. }
  757. common_peg_parser common_peg_parser::operator<<(const common_peg_parser & other) const {
  758. return builder_.sequence({id_, builder_.space(), other.id_});
  759. }
  760. common_peg_parser common_peg_parser::operator+(const char * str) const {
  761. return *this + builder_.literal(str);
  762. }
  763. common_peg_parser common_peg_parser::operator+(const std::string & str) const {
  764. return *this + builder_.literal(str);
  765. }
  766. common_peg_parser common_peg_parser::operator<<(const char * str) const {
  767. return *this << builder_.literal(str);
  768. }
  769. common_peg_parser common_peg_parser::operator<<(const std::string & str) const {
  770. return *this << builder_.literal(str);
  771. }
  772. common_peg_parser common_peg_parser::operator|(const char * str) const {
  773. return *this | builder_.literal(str);
  774. }
  775. common_peg_parser common_peg_parser::operator|(const std::string & str) const {
  776. return *this | builder_.literal(str);
  777. }
  778. common_peg_parser operator+(const char * str, const common_peg_parser & p) {
  779. return p.builder().literal(str) + p;
  780. }
  781. common_peg_parser operator+(const std::string & str, const common_peg_parser & p) {
  782. return operator+(str.c_str(), p);
  783. }
  784. common_peg_parser operator<<(const char * str, const common_peg_parser & p) {
  785. return p.builder().literal(str) << p;
  786. }
  787. common_peg_parser operator<<(const std::string & str, const common_peg_parser & p) {
  788. return operator<<(str.c_str(), p);
  789. }
  790. common_peg_parser operator|(const char * str, const common_peg_parser & p) {
  791. return p.builder().literal(str) | p;
  792. }
  793. common_peg_parser operator|(const std::string & str, const common_peg_parser & p) {
  794. return operator|(str.c_str(), p);
  795. }
  796. static std::string rule_name(const std::string & name) {
  797. static const std::regex invalid_rule_chars_re("[^a-zA-Z0-9-]+");
  798. return std::regex_replace(name, invalid_rule_chars_re, "-");
  799. }
  800. common_peg_parser_builder::common_peg_parser_builder() {}
  801. common_peg_parser common_peg_parser_builder::sequence(const std::vector<common_peg_parser_id> & parsers) {
  802. // Flatten nested sequences
  803. std::vector<common_peg_parser_id> flattened;
  804. for (const auto & p : parsers) {
  805. const auto & parser = arena_.get(p);
  806. if (auto seq = std::get_if<common_peg_sequence_parser>(&parser)) {
  807. flattened.insert(flattened.end(), seq->children.begin(), seq->children.end());
  808. } else {
  809. flattened.push_back(p);
  810. }
  811. }
  812. return wrap(arena_.add_parser(common_peg_sequence_parser{flattened}));
  813. }
  814. common_peg_parser common_peg_parser_builder::sequence(const std::vector<common_peg_parser> & parsers) {
  815. std::vector<common_peg_parser_id> ids;
  816. ids.reserve(parsers.size());
  817. for (const auto & p : parsers) {
  818. ids.push_back(p.id());
  819. }
  820. return sequence(ids);
  821. }
  822. common_peg_parser common_peg_parser_builder::sequence(std::initializer_list<common_peg_parser> parsers) {
  823. std::vector<common_peg_parser_id> ids;
  824. ids.reserve(parsers.size());
  825. for (const auto & p : parsers) {
  826. ids.push_back(p.id());
  827. }
  828. return sequence(ids);
  829. }
  830. common_peg_parser common_peg_parser_builder::choice(const std::vector<common_peg_parser_id> & parsers) {
  831. // Flatten nested choices
  832. std::vector<common_peg_parser_id> flattened;
  833. for (const auto & p : parsers) {
  834. const auto & parser = arena_.get(p);
  835. if (auto choice = std::get_if<common_peg_choice_parser>(&parser)) {
  836. flattened.insert(flattened.end(), choice->children.begin(), choice->children.end());
  837. } else {
  838. flattened.push_back(p);
  839. }
  840. }
  841. return wrap(arena_.add_parser(common_peg_choice_parser{flattened}));
  842. }
  843. common_peg_parser common_peg_parser_builder::choice(const std::vector<common_peg_parser> & parsers) {
  844. std::vector<common_peg_parser_id> ids;
  845. ids.reserve(parsers.size());
  846. for (const auto & p : parsers) {
  847. ids.push_back(p.id());
  848. }
  849. return choice(ids);
  850. }
  851. common_peg_parser common_peg_parser_builder::choice(std::initializer_list<common_peg_parser> parsers) {
  852. std::vector<common_peg_parser_id> ids;
  853. ids.reserve(parsers.size());
  854. for (const auto & p : parsers) {
  855. ids.push_back(p.id());
  856. }
  857. return choice(ids);
  858. }
  859. common_peg_parser common_peg_parser_builder::chars(const std::string & classes, int min, int max) {
  860. auto [ranges, negated] = parse_char_classes(classes);
  861. return wrap(arena_.add_parser(common_peg_chars_parser{classes, ranges, negated, min, max}));
  862. }
  863. common_peg_parser common_peg_parser_builder::schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw) {
  864. return wrap(arena_.add_parser(common_peg_schema_parser{p.id(), name, std::make_shared<nlohmann::ordered_json>(schema), raw}));
  865. }
  866. common_peg_parser common_peg_parser_builder::rule(const std::string & name, const common_peg_parser & p, bool trigger) {
  867. auto clean_name = rule_name(name);
  868. auto rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, p.id(), trigger});
  869. arena_.add_rule(clean_name, rule_id);
  870. return ref(clean_name);
  871. }
  872. common_peg_parser common_peg_parser_builder::rule(const std::string & name, const std::function<common_peg_parser()> & builder_fn, bool trigger) {
  873. auto clean_name = rule_name(name);
  874. if (arena_.has_rule(clean_name)) {
  875. return ref(clean_name);
  876. }
  877. // Create placeholder rule to allow recursive references
  878. auto placeholder = any(); // Temporary placeholder
  879. auto placeholder_rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, placeholder.id(), trigger});
  880. arena_.add_rule(clean_name, placeholder_rule_id);
  881. // Build the actual parser
  882. auto parser = builder_fn();
  883. // Replace placeholder with actual rule
  884. auto rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, parser.id(), trigger});
  885. arena_.rules_[clean_name] = rule_id;
  886. return ref(clean_name);
  887. }
  888. void common_peg_parser_builder::set_root(const common_peg_parser & p) {
  889. arena_.set_root(p.id());
  890. }
  891. common_peg_arena common_peg_parser_builder::build() {
  892. arena_.resolve_refs();
  893. return std::move(arena_);
  894. }
  895. // JSON parsers
  896. common_peg_parser common_peg_parser_builder::json_number() {
  897. return rule("json-number", [this]() {
  898. auto digit1_9 = chars("[1-9]", 1, 1);
  899. auto digits = chars("[0-9]");
  900. auto int_part = choice({literal("0"), sequence({digit1_9, chars("[0-9]", 0, -1)})});
  901. auto frac = sequence({literal("."), digits});
  902. auto exp = sequence({choice({literal("e"), literal("E")}), optional(chars("[+-]", 1, 1)), digits});
  903. return sequence({optional(literal("-")), int_part, optional(frac), optional(exp), space()});
  904. });
  905. }
  906. common_peg_parser common_peg_parser_builder::json_string() {
  907. return rule("json-string", [this]() {
  908. return sequence({literal("\""), json_string_content(), literal("\""), space()});
  909. });
  910. }
  911. common_peg_parser common_peg_parser_builder::json_bool() {
  912. return rule("json-bool", [this]() {
  913. return sequence({choice({literal("true"), literal("false")}), space()});
  914. });
  915. }
  916. common_peg_parser common_peg_parser_builder::json_null() {
  917. return rule("json-null", [this]() {
  918. return sequence({literal("null"), space()});
  919. });
  920. }
  921. common_peg_parser common_peg_parser_builder::json_object() {
  922. return rule("json-object", [this]() {
  923. auto ws = space();
  924. auto member = sequence({json_string(), ws, literal(":"), ws, json()});
  925. auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))});
  926. return sequence({
  927. literal("{"),
  928. ws,
  929. choice({
  930. literal("}"),
  931. sequence({members, ws, literal("}")})
  932. }),
  933. ws
  934. });
  935. });
  936. }
  937. common_peg_parser common_peg_parser_builder::json_array() {
  938. return rule("json-array", [this]() {
  939. auto ws = space();
  940. auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
  941. return sequence({
  942. literal("["),
  943. ws,
  944. choice({
  945. literal("]"),
  946. sequence({elements, ws, literal("]")})
  947. }),
  948. ws
  949. });
  950. });
  951. }
  952. common_peg_parser common_peg_parser_builder::json() {
  953. return rule("json-value", [this]() {
  954. return choice({
  955. json_object(),
  956. json_array(),
  957. json_string(),
  958. json_number(),
  959. json_bool(),
  960. json_null()
  961. });
  962. });
  963. }
  964. common_peg_parser common_peg_parser_builder::json_string_content() {
  965. return wrap(arena_.add_parser(common_peg_json_string_parser{}));
  966. }
  967. common_peg_parser common_peg_parser_builder::json_member(const std::string & key, const common_peg_parser & p) {
  968. auto ws = space();
  969. return sequence({
  970. literal("\"" + key + "\""),
  971. ws,
  972. literal(":"),
  973. ws,
  974. p,
  975. });
  976. }
  977. static std::string gbnf_escape_char_class(char c) {
  978. switch (c) {
  979. case '\n': return "\\n";
  980. case '\t': return "\\t";
  981. case '\r': return "\\r";
  982. case '\\': return "\\\\";
  983. case ']': return "\\]";
  984. case '[': return "\\[";
  985. default: return std::string(1, c);
  986. }
  987. }
  988. static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
  989. trie matcher(strings);
  990. auto pieces = matcher.collect_prefix_and_next();
  991. std::string pattern;
  992. for (size_t i = 0; i < pieces.size(); ++i) {
  993. if (i > 0) {
  994. pattern += " | ";
  995. }
  996. const auto & pre = pieces[i].prefix;
  997. const auto & chars = pieces[i].next_chars;
  998. std::string cls;
  999. cls.reserve(chars.size());
  1000. for (const auto & ch : chars) {
  1001. cls += gbnf_escape_char_class(ch);
  1002. }
  1003. if (!pre.empty()) {
  1004. pattern += gbnf_format_literal(pre) + " [^" + cls + "]";
  1005. } else {
  1006. pattern += "[^" + cls + "]";
  1007. }
  1008. }
  1009. return "(" + pattern + ")*";
  1010. }
  1011. static std::unordered_set<std::string> collect_reachable_rules(
  1012. const common_peg_arena & arena,
  1013. const common_peg_parser_id & rule
  1014. ) {
  1015. std::unordered_set<std::string> reachable;
  1016. std::unordered_set<std::string> visited;
  1017. std::function<void(common_peg_parser_id)> visit = [&](common_peg_parser_id id) {
  1018. const auto & parser = arena.get(id);
  1019. std::visit([&](const auto & p) {
  1020. using T = std::decay_t<decltype(p)>;
  1021. if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
  1022. std::is_same_v<T, common_peg_start_parser> ||
  1023. std::is_same_v<T, common_peg_end_parser> ||
  1024. std::is_same_v<T, common_peg_until_parser> ||
  1025. std::is_same_v<T, common_peg_literal_parser> ||
  1026. std::is_same_v<T, common_peg_chars_parser> ||
  1027. std::is_same_v<T, common_peg_space_parser> ||
  1028. std::is_same_v<T, common_peg_any_parser> ||
  1029. std::is_same_v<T, common_peg_json_string_parser>) {
  1030. // These parsers do not have any children
  1031. } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
  1032. for (auto child : p.children) {
  1033. visit(child);
  1034. }
  1035. } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
  1036. for (auto child : p.children) {
  1037. visit(child);
  1038. }
  1039. } else if constexpr (std::is_same_v<T, common_peg_repetition_parser> ||
  1040. std::is_same_v<T, common_peg_and_parser> ||
  1041. std::is_same_v<T, common_peg_not_parser> ||
  1042. std::is_same_v<T, common_peg_tag_parser> ||
  1043. std::is_same_v<T, common_peg_atomic_parser> ||
  1044. std::is_same_v<T, common_peg_schema_parser>) {
  1045. visit(p.child);
  1046. } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
  1047. if (visited.find(p.name) == visited.end()) {
  1048. visited.insert(p.name);
  1049. reachable.insert(p.name);
  1050. visit(p.child);
  1051. }
  1052. } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
  1053. // Traverse rules so we pick up everything
  1054. auto referenced_rule = arena.get_rule(p.name);
  1055. visit(referenced_rule);
  1056. } else {
  1057. static_assert(is_always_false_v<T>);
  1058. }
  1059. }, parser);
  1060. };
  1061. visit(rule);
  1062. return reachable;
  1063. }
  1064. // GBNF generation implementation
  1065. void common_peg_arena::build_grammar(const common_grammar_builder & builder, bool lazy) const {
  1066. // Generate GBNF for a parser
  1067. std::function<std::string(common_peg_parser_id)> to_gbnf = [&](common_peg_parser_id id) -> std::string {
  1068. const auto & parser = parsers_.at(id);
  1069. return std::visit([&](const auto & p) -> std::string {
  1070. using T = std::decay_t<decltype(p)>;
  1071. if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
  1072. std::is_same_v<T, common_peg_start_parser> ||
  1073. std::is_same_v<T, common_peg_end_parser>) {
  1074. return "";
  1075. } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
  1076. return gbnf_format_literal(p.literal);
  1077. } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
  1078. std::string s;
  1079. for (const auto & child : p.children) {
  1080. if (!s.empty()) {
  1081. s += " ";
  1082. }
  1083. auto child_gbnf = to_gbnf(child);
  1084. const auto & child_parser = parsers_.at(child);
  1085. if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
  1086. std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
  1087. s += "(" + child_gbnf + ")";
  1088. } else {
  1089. s += child_gbnf;
  1090. }
  1091. }
  1092. return s;
  1093. } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
  1094. std::string s;
  1095. for (const auto & child : p.children) {
  1096. if (!s.empty()) {
  1097. s += " | ";
  1098. }
  1099. auto child_gbnf = to_gbnf(child);
  1100. const auto & child_parser = parsers_.at(child);
  1101. if (std::holds_alternative<common_peg_choice_parser>(child_parser)) {
  1102. s += "(" + child_gbnf + ")";
  1103. } else {
  1104. s += child_gbnf;
  1105. }
  1106. }
  1107. return s;
  1108. } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
  1109. auto child_gbnf = to_gbnf(p.child);
  1110. const auto & child_parser = parsers_.at(p.child);
  1111. if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
  1112. std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
  1113. child_gbnf = "(" + child_gbnf + ")";
  1114. }
  1115. if (p.min_count == 0 && p.max_count == 1) {
  1116. return child_gbnf + "?";
  1117. }
  1118. if (p.min_count == 0 && p.max_count == -1) {
  1119. return child_gbnf + "*";
  1120. }
  1121. if (p.min_count == 1 && p.max_count == -1) {
  1122. return child_gbnf + "+";
  1123. }
  1124. if (p.max_count == -1) {
  1125. return child_gbnf + "{" + std::to_string(p.min_count) + ",}";
  1126. }
  1127. if (p.min_count == p.max_count) {
  1128. if (p.min_count == 1) {
  1129. return child_gbnf;
  1130. }
  1131. return child_gbnf + "{" + std::to_string(p.min_count) + "}";
  1132. }
  1133. return child_gbnf + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
  1134. } else if constexpr (std::is_same_v<T, common_peg_and_parser> || std::is_same_v<T, common_peg_not_parser>) {
  1135. return ""; // Lookahead not supported in GBNF
  1136. } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
  1137. return ".";
  1138. } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
  1139. return "space";
  1140. } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
  1141. std::string result = p.pattern;
  1142. if (p.min_count == 0 && p.max_count == 1) {
  1143. return result + "?";
  1144. }
  1145. if (p.min_count == 0 && p.max_count == -1) {
  1146. return result + "*";
  1147. }
  1148. if (p.min_count == 1 && p.max_count == -1) {
  1149. return result + "+";
  1150. }
  1151. if (p.max_count == -1) {
  1152. return result + "{" + std::to_string(p.min_count) + ",}";
  1153. }
  1154. if (p.min_count == p.max_count) {
  1155. if (p.min_count == 1) {
  1156. return result;
  1157. }
  1158. return result + "{" + std::to_string(p.min_count) + "}";
  1159. }
  1160. return result + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
  1161. } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
  1162. return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
  1163. } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
  1164. if (p.delimiters.empty()) {
  1165. return ".*";
  1166. }
  1167. return gbnf_excluding_pattern(p.delimiters);
  1168. } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
  1169. if (p.schema) {
  1170. if (p.raw && p.schema->contains("type") && p.schema->at("type").is_string() && p.schema->at("type") == "string") {
  1171. // TODO: Implement more comprehensive grammar generation for raw strings.
  1172. // For now, use the grammar emitted from the underlying parser.
  1173. return to_gbnf(p.child);
  1174. }
  1175. return builder.add_schema(p.name, *p.schema);
  1176. }
  1177. return to_gbnf(p.child);
  1178. } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
  1179. return p.name;
  1180. } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
  1181. // Refs should not exist after flattening, but kept just in case
  1182. return p.name;
  1183. } else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
  1184. return to_gbnf(p.child);
  1185. } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
  1186. return to_gbnf(p.child);
  1187. } else {
  1188. static_assert(is_always_false_v<T>);
  1189. }
  1190. }, parser);
  1191. };
  1192. // Collect reachable rules
  1193. std::unordered_set<std::string> reachable_rules;
  1194. if (lazy) {
  1195. // Collect rules reachable from trigger rules
  1196. for (const auto & [name, id] : rules_) {
  1197. const auto & parser = parsers_.at(id);
  1198. if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
  1199. if (rule->trigger) {
  1200. // Mark trigger as reachable and visit it
  1201. reachable_rules.insert(name);
  1202. auto add_rules = collect_reachable_rules(*this, id);
  1203. reachable_rules.insert(add_rules.begin(), add_rules.end());
  1204. }
  1205. }
  1206. }
  1207. } else {
  1208. // Collect rules reachable from root
  1209. reachable_rules = collect_reachable_rules(*this, root_);
  1210. }
  1211. // Create GBNF rules for all reachable rules
  1212. for (const auto & [name, rule_id] : rules_) {
  1213. if (reachable_rules.find(name) == reachable_rules.end()) {
  1214. continue;
  1215. }
  1216. const auto & parser = parsers_.at(rule_id);
  1217. if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
  1218. builder.add_rule(rule->name, to_gbnf(rule->child));
  1219. }
  1220. }
  1221. if (lazy) {
  1222. // Generate root rule from trigger rules only
  1223. std::vector<std::string> trigger_names;
  1224. for (const auto & [name, rule_id] : rules_) {
  1225. const auto & parser = parsers_.at(rule_id);
  1226. if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
  1227. if (rule->trigger) {
  1228. trigger_names.push_back(rule->name);
  1229. }
  1230. }
  1231. }
  1232. // Sort for predictable order
  1233. std::sort(trigger_names.begin(), trigger_names.end());
  1234. builder.add_rule("root", string_join(trigger_names, " | "));
  1235. } else if (root_ != COMMON_PEG_INVALID_PARSER_ID) {
  1236. builder.add_rule("root", to_gbnf(root_));
  1237. }
  1238. }
  1239. static nlohmann::json serialize_parser_variant(const common_peg_parser_variant & variant) {
  1240. using json = nlohmann::json;
  1241. return std::visit([](const auto & p) -> json {
  1242. using T = std::decay_t<decltype(p)>;
  1243. if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
  1244. return json{{"type", "epsilon"}};
  1245. } else if constexpr (std::is_same_v<T, common_peg_start_parser>) {
  1246. return json{{"type", "start"}};
  1247. } else if constexpr (std::is_same_v<T, common_peg_end_parser>) {
  1248. return json{{"type", "end"}};
  1249. } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
  1250. return json{{"type", "literal"}, {"literal", p.literal}};
  1251. } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
  1252. return json{{"type", "sequence"}, {"children", p.children}};
  1253. } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
  1254. return json{{"type", "choice"}, {"children", p.children}};
  1255. } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
  1256. return json{
  1257. {"type", "repetition"},
  1258. {"child", p.child},
  1259. {"min_count", p.min_count},
  1260. {"max_count", p.max_count}
  1261. };
  1262. } else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
  1263. return json{{"type", "and"}, {"child", p.child}};
  1264. } else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
  1265. return json{{"type", "not"}, {"child", p.child}};
  1266. } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
  1267. return json{{"type", "any"}};
  1268. } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
  1269. return json{{"type", "space"}};
  1270. } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
  1271. json ranges = json::array();
  1272. for (const auto & range : p.ranges) {
  1273. ranges.push_back({{"start", range.start}, {"end", range.end}});
  1274. }
  1275. return json{
  1276. {"type", "chars"},
  1277. {"pattern", p.pattern},
  1278. {"ranges", ranges},
  1279. {"negated", p.negated},
  1280. {"min_count", p.min_count},
  1281. {"max_count", p.max_count}
  1282. };
  1283. } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
  1284. return json{{"type", "json_string"}};
  1285. } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
  1286. return json{{"type", "until"}, {"delimiters", p.delimiters}};
  1287. } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
  1288. return json{
  1289. {"type", "schema"},
  1290. {"child", p.child},
  1291. {"name", p.name},
  1292. {"schema", p.schema ? *p.schema : nullptr},
  1293. {"raw", p.raw}
  1294. };
  1295. } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
  1296. return json{
  1297. {"type", "rule"},
  1298. {"name", p.name},
  1299. {"child", p.child},
  1300. {"trigger", p.trigger}
  1301. };
  1302. } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
  1303. return json{{"type", "ref"}, {"name", p.name}};
  1304. } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
  1305. return json{{"type", "atomic"}, {"child", p.child}};
  1306. } else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
  1307. return json{
  1308. {"type", "tag"},
  1309. {"child", p.child},
  1310. {"tag", p.tag}
  1311. };
  1312. }
  1313. }, variant);
  1314. }
  1315. nlohmann::json common_peg_arena::to_json() const {
  1316. auto parsers = nlohmann::json::array();
  1317. for (const auto & parser : parsers_) {
  1318. parsers.push_back(serialize_parser_variant(parser));
  1319. }
  1320. return nlohmann::json{
  1321. {"parsers", parsers},
  1322. {"rules", rules_},
  1323. {"root", root_}
  1324. };
  1325. }
  1326. static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json & j) {
  1327. if (!j.contains("type") || !j["type"].is_string()) {
  1328. throw std::runtime_error("Parser variant JSON missing or invalid 'type' field");
  1329. }
  1330. std::string type = j["type"];
  1331. if (type == "epsilon") {
  1332. return common_peg_epsilon_parser{};
  1333. }
  1334. if (type == "start") {
  1335. return common_peg_start_parser{};
  1336. }
  1337. if (type == "end") {
  1338. return common_peg_end_parser{};
  1339. }
  1340. if (type == "literal") {
  1341. if (!j.contains("literal") || !j["literal"].is_string()) {
  1342. throw std::runtime_error("literal parser missing or invalid 'literal' field");
  1343. }
  1344. return common_peg_literal_parser{j["literal"]};
  1345. }
  1346. if (type == "sequence") {
  1347. if (!j.contains("children") || !j["children"].is_array()) {
  1348. throw std::runtime_error("sequence parser missing or invalid 'children' field");
  1349. }
  1350. return common_peg_sequence_parser{j["children"].get<std::vector<common_peg_parser_id>>()};
  1351. }
  1352. if (type == "choice") {
  1353. if (!j.contains("children") || !j["children"].is_array()) {
  1354. throw std::runtime_error("choice parser missing or invalid 'children' field");
  1355. }
  1356. return common_peg_choice_parser{j["children"].get<std::vector<common_peg_parser_id>>()};
  1357. }
  1358. if (type == "repetition") {
  1359. if (!j.contains("child") || !j.contains("min_count") || !j.contains("max_count")) {
  1360. throw std::runtime_error("repetition parser missing required fields");
  1361. }
  1362. return common_peg_repetition_parser{
  1363. j["child"].get<common_peg_parser_id>(),
  1364. j["min_count"].get<int>(),
  1365. j["max_count"].get<int>()
  1366. };
  1367. }
  1368. if (type == "and") {
  1369. if (!j.contains("child")) {
  1370. throw std::runtime_error("and parser missing 'child' field");
  1371. }
  1372. return common_peg_and_parser{j["child"].get<common_peg_parser_id>()};
  1373. }
  1374. if (type == "not") {
  1375. if (!j.contains("child")) {
  1376. throw std::runtime_error("not parser missing 'child' field");
  1377. }
  1378. return common_peg_not_parser{j["child"].get<common_peg_parser_id>()};
  1379. }
  1380. if (type == "any") {
  1381. return common_peg_any_parser{};
  1382. }
  1383. if (type == "space") {
  1384. return common_peg_space_parser{};
  1385. }
  1386. if (type == "chars") {
  1387. if (!j.contains("pattern") || !j.contains("ranges") || !j.contains("negated") ||
  1388. !j.contains("min_count") || !j.contains("max_count")) {
  1389. throw std::runtime_error("chars parser missing required fields");
  1390. }
  1391. common_peg_chars_parser parser;
  1392. parser.pattern = j["pattern"];
  1393. parser.negated = j["negated"];
  1394. parser.min_count = j["min_count"];
  1395. parser.max_count = j["max_count"];
  1396. for (const auto & range_json : j["ranges"]) {
  1397. if (!range_json.contains("start") || !range_json.contains("end")) {
  1398. throw std::runtime_error("char_range missing 'start' or 'end' field");
  1399. }
  1400. parser.ranges.push_back({
  1401. range_json["start"].get<uint32_t>(),
  1402. range_json["end"].get<uint32_t>()
  1403. });
  1404. }
  1405. return parser;
  1406. }
  1407. if (type == "json_string") {
  1408. return common_peg_json_string_parser{};
  1409. }
  1410. if (type == "until") {
  1411. if (!j.contains("delimiters") || !j["delimiters"].is_array()) {
  1412. throw std::runtime_error("until parser missing or invalid 'delimiters' field");
  1413. }
  1414. return common_peg_until_parser{j["delimiters"].get<std::vector<std::string>>()};
  1415. }
  1416. if (type == "schema") {
  1417. if (!j.contains("child") || !j.contains("name") || !j.contains("schema") || !j.contains("raw")) {
  1418. throw std::runtime_error("schema parser missing required fields");
  1419. }
  1420. common_peg_schema_parser parser;
  1421. parser.child = j["child"].get<common_peg_parser_id>();
  1422. parser.name = j["name"];
  1423. if (!j["schema"].is_null()) {
  1424. parser.schema = std::make_shared<nlohmann::ordered_json>(j["schema"]);
  1425. }
  1426. parser.raw = j["raw"].get<bool>();
  1427. return parser;
  1428. }
  1429. if (type == "rule") {
  1430. if (!j.contains("name") || !j.contains("child") || !j.contains("trigger")) {
  1431. throw std::runtime_error("rule parser missing required fields");
  1432. }
  1433. return common_peg_rule_parser{
  1434. j["name"].get<std::string>(),
  1435. j["child"].get<common_peg_parser_id>(),
  1436. j["trigger"].get<bool>()
  1437. };
  1438. }
  1439. if (type == "ref") {
  1440. if (!j.contains("name") || !j["name"].is_string()) {
  1441. throw std::runtime_error("ref parser missing or invalid 'name' field");
  1442. }
  1443. return common_peg_ref_parser{j["name"]};
  1444. }
  1445. if (type == "atomic") {
  1446. if (!j.contains("child")) {
  1447. throw std::runtime_error("tag parser missing required fields");
  1448. }
  1449. return common_peg_atomic_parser{
  1450. j["child"].get<common_peg_parser_id>(),
  1451. };
  1452. }
  1453. if (type == "tag") {
  1454. if (!j.contains("child") || !j.contains("tag")) {
  1455. throw std::runtime_error("tag parser missing required fields");
  1456. }
  1457. return common_peg_tag_parser{
  1458. j["child"].get<common_peg_parser_id>(),
  1459. j["tag"].get<std::string>(),
  1460. };
  1461. }
  1462. throw std::runtime_error("Unknown parser type: " + type);
  1463. }
  1464. common_peg_arena common_peg_arena::from_json(const nlohmann::json & j) {
  1465. if (!j.contains("parsers") || !j["parsers"].is_array()) {
  1466. throw std::runtime_error("JSON missing or invalid 'parsers' array");
  1467. }
  1468. if (!j.contains("rules") || !j["rules"].is_object()) {
  1469. throw std::runtime_error("JSON missing or invalid 'rules' object");
  1470. }
  1471. if (!j.contains("root")) {
  1472. throw std::runtime_error("JSON missing 'root' field");
  1473. }
  1474. common_peg_arena arena;
  1475. const auto & parsers_json = j["parsers"];
  1476. arena.parsers_.reserve(parsers_json.size());
  1477. for (const auto & parser_json : parsers_json) {
  1478. arena.parsers_.push_back(deserialize_parser_variant(parser_json));
  1479. }
  1480. arena.rules_ = j["rules"].get<std::unordered_map<std::string, common_peg_parser_id>>();
  1481. for (const auto & [name, id] : arena.rules_) {
  1482. if (id >= arena.parsers_.size()) {
  1483. throw std::runtime_error("Rule '" + name + "' references invalid parser ID: " + std::to_string(id));
  1484. }
  1485. }
  1486. arena.root_ = j["root"].get<common_peg_parser_id>();
  1487. if (arena.root_ != COMMON_PEG_INVALID_PARSER_ID && arena.root_ >= arena.parsers_.size()) {
  1488. throw std::runtime_error("Root references invalid parser ID: " + std::to_string(arena.root_));
  1489. }
  1490. return arena;
  1491. }
  1492. std::string common_peg_arena::save() const {
  1493. return to_json().dump();
  1494. }
  1495. void common_peg_arena::load(const std::string & data) {
  1496. *this = from_json(nlohmann::json::parse(data));
  1497. }
  1498. common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn) {
  1499. common_peg_parser_builder builder;
  1500. builder.set_root(fn(builder));
  1501. return builder.build();
  1502. }