json-schema-to-grammar.mjs 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538
  1. // WARNING: This file was ported from json-schema-to-grammar.py, please fix bugs / add features there first.
  2. const SPACE_RULE = '" "?';
  3. const PRIMITIVE_RULES = {
  4. boolean: '("true" | "false") space',
  5. number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
  6. integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
  7. value: 'object | array | string | number | boolean',
  8. object: '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
  9. array: '"[" space ( value ("," space value)* )? "]" space',
  10. uuid: '"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space',
  11. string: ` "\\"" (
  12. [^"\\\\] |
  13. "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
  14. )* "\\"" space`,
  15. null: '"null" space',
  16. };
  17. const OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'];
  18. // TODO: support "uri", "email" string formats
  19. const DATE_RULES = {
  20. 'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
  21. 'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
  22. 'date-time': 'date "T" time',
  23. 'date-string': '"\\"" date "\\"" space',
  24. 'time-string': '"\\"" time "\\"" space',
  25. 'date-time-string': '"\\"" date-time "\\"" space',
  26. };
  27. const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...DATE_RULES};
  28. const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
  29. const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
  30. const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
  31. const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
  32. const NON_LITERAL_SET = new Set('|.()[]{}*+?');
  33. const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');
  34. export class SchemaConverter {
  35. constructor(options) {
  36. this._propOrder = options.prop_order || {};
  37. this._allowFetch = options.allow_fetch || false;
  38. this._dotall = options.dotall || false;
  39. this._rules = {'space': SPACE_RULE};
  40. this._refs = {};
  41. this._refsBeingResolved = new Set();
  42. }
  43. _formatLiteral(literal) {
  44. const escaped = literal.replace(
  45. GRAMMAR_LITERAL_ESCAPE_RE,
  46. m => GRAMMAR_LITERAL_ESCAPES[m]
  47. );
  48. return `"${escaped}"`;
  49. }
  50. _formatRangeChar(literal) {
  51. return JSON.stringify(literal).slice(1, -1).replace(
  52. GRAMMAR_RANGE_LITERAL_ESCAPE_RE,
  53. m => GRAMMAR_LITERAL_ESCAPES[m]
  54. );
  55. }
  56. _addRule(name, rule) {
  57. let escName = name.replace(INVALID_RULE_CHARS_RE, '-');
  58. let key = escName;
  59. if (escName in this._rules) {
  60. if (this._rules[escName] === rule) {
  61. return key;
  62. }
  63. let i = 0;
  64. while ((`${escName}${i}` in this._rules) && (this._rules[`${escName}${i}`] !== rule)) {
  65. i += 1;
  66. }
  67. key = `${escName}${i}`;
  68. }
  69. this._rules[key] = rule;
  70. return key;
  71. }
  72. async resolveRefs(schema, url) {
  73. const visit = async (n) => {
  74. if (Array.isArray(n)) {
  75. return Promise.all(n.map(visit));
  76. } else if (typeof n === 'object' && n !== null) {
  77. let ref = n.$ref;
  78. let target;
  79. if (ref !== undefined && !this._refs[ref]) {
  80. if (ref.startsWith('https://')) {
  81. if (!this._allowFetch) {
  82. throw new Error('Fetching remote schemas is not allowed (use --allow-fetch for force)');
  83. }
  84. const fetch = (await import('node-fetch')).default;
  85. const fragSplit = ref.split('#');
  86. const baseUrl = fragSplit[0];
  87. target = this._refs[baseUrl];
  88. if (!target) {
  89. target = await this.resolveRefs(await fetch(ref).then(res => res.json()), baseUrl);
  90. this._refs[baseUrl] = target;
  91. }
  92. if (fragSplit.length === 1 || fragSplit[fragSplit.length - 1] === '') {
  93. return target;
  94. }
  95. } else if (ref.startsWith('#/')) {
  96. target = schema;
  97. ref = `${url}${ref}`;
  98. n.$ref = ref;
  99. } else {
  100. throw new Error(`Unsupported ref ${ref}`);
  101. }
  102. const selectors = ref.split('#')[1].split('/').slice(1);
  103. for (const sel of selectors) {
  104. if (!target || !(sel in target)) {
  105. throw new Error(`Error resolving ref ${ref}: ${sel} not in ${JSON.stringify(target)}`);
  106. }
  107. target = target[sel];
  108. }
  109. this._refs[ref] = target;
  110. } else {
  111. await Promise.all(Object.values(n).map(visit));
  112. }
  113. }
  114. return n;
  115. };
  116. return visit(schema);
  117. }
  118. _generateUnionRule(name, altSchemas) {
  119. return altSchemas
  120. .map((altSchema, i) => this.visit(altSchema, `${name ?? ''}${name ? '-' : 'alternative-'}${i}`))
  121. .join(' | ');
  122. }
  123. _visitPattern(pattern, name) {
  124. if (!pattern.startsWith('^') || !pattern.endsWith('$')) {
  125. throw new Error('Pattern must start with "^" and end with "$"');
  126. }
  127. pattern = pattern.slice(1, -1);
  128. const subRuleIds = {};
  129. let i = 0;
  130. const length = pattern.length;
  131. const getDot = () => {
  132. let rule;
  133. if (this._dotall) {
  134. rule = '[\\U00000000-\\U0010FFFF]';
  135. } else {
  136. // Accept any character... except \n and \r line break chars (\x0A and \xOD)
  137. rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]';
  138. }
  139. return this._addRule('dot', rule);
  140. };
  141. const toRule = ([s, isLiteral]) => isLiteral ? "\"" + s + "\"" : s;
  142. const transform = () => {
  143. const start = i;
  144. // For each component of this sequence, store its string representation and whether it's a literal.
  145. // We only need a flat structure here to apply repetition operators to the last item, and
  146. // to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
  147. // (GBNF's syntax is luckily very close to regular expressions!)
  148. const seq = [];
  149. const joinSeq = () => {
  150. const ret = [];
  151. for (const [isLiteral, g] of groupBy(seq, x => x[1])) {
  152. if (isLiteral) {
  153. ret.push([[...g].map(x => x[0]).join(''), true]);
  154. } else {
  155. ret.push(...g);
  156. }
  157. }
  158. if (ret.length === 1) {
  159. return ret[0];
  160. }
  161. return [ret.map(x => toRule(x)).join(' '), false];
  162. };
  163. while (i < length) {
  164. const c = pattern[i];
  165. if (c === '.') {
  166. seq.push([getDot(), false]);
  167. i += 1;
  168. } else if (c === '(') {
  169. i += 1;
  170. if (i < length) {
  171. if (pattern[i] === '?') {
  172. throw new Error(`Unsupported pattern syntax "${pattern[i]}" at index ${i} of /${pattern}/`);
  173. }
  174. }
  175. seq.push([`(${toRule(transform())})`, false]);
  176. } else if (c === ')') {
  177. i += 1;
  178. if (start <= 0 || pattern[start - 1] !== '(') {
  179. throw new Error(`Unbalanced parentheses; start = ${start}, i = ${i}, pattern = ${pattern}`);
  180. }
  181. return joinSeq();
  182. } else if (c === '[') {
  183. let squareBrackets = c;
  184. i += 1;
  185. while (i < length && pattern[i] !== ']') {
  186. if (pattern[i] === '\\') {
  187. squareBrackets += pattern.slice(i, i + 2);
  188. i += 2;
  189. } else {
  190. squareBrackets += pattern[i];
  191. i += 1;
  192. }
  193. }
  194. if (i >= length) {
  195. throw new Error(`Unbalanced square brackets; start = ${start}, i = ${i}, pattern = ${pattern}`);
  196. }
  197. squareBrackets += ']';
  198. i += 1;
  199. seq.push([squareBrackets, false]);
  200. } else if (c === '|') {
  201. seq.push(['|', false]);
  202. i += 1;
  203. } else if (c === '*' || c === '+' || c === '?') {
  204. seq[seq.length - 1] = [toRule(seq[seq.length - 1]) + c, false];
  205. i += 1;
  206. } else if (c === '{') {
  207. let curlyBrackets = c;
  208. i += 1;
  209. while (i < length && pattern[i] !== '}') {
  210. curlyBrackets += pattern[i];
  211. i += 1;
  212. }
  213. if (i >= length) {
  214. throw new Error(`Unbalanced curly brackets; start = ${start}, i = ${i}, pattern = ${pattern}`);
  215. }
  216. curlyBrackets += '}';
  217. i += 1;
  218. const nums = curlyBrackets.slice(1, -1).split(',').map(s => s.trim());
  219. let minTimes, maxTimes;
  220. if (nums.length === 1) {
  221. minTimes = parseInt(nums[0], 10);
  222. maxTimes = minTimes;
  223. } else {
  224. if (nums.length !== 2) {
  225. throw new Error(`Invalid quantifier ${curlyBrackets}`);
  226. }
  227. minTimes = nums[0] ? parseInt(nums[0], 10) : 0;
  228. maxTimes = nums[1] ? parseInt(nums[1], 10) : Infinity;
  229. }
  230. let [sub, subIsLiteral] = seq[seq.length - 1];
  231. if (minTimes === 0 && maxTimes === Infinity) {
  232. seq[seq.length - 1] = [`${sub}*`, false];
  233. } else if (minTimes === 0 && maxTimes === 1) {
  234. seq[seq.length - 1] = [`${sub}?`, false];
  235. } else if (minTimes === 1 && maxTimes === Infinity) {
  236. seq[seq.length - 1] = [`${sub}+`, false];
  237. } else {
  238. if (!subIsLiteral) {
  239. let id = subRuleIds[sub];
  240. if (id === undefined) {
  241. id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub);
  242. subRuleIds[sub] = id;
  243. }
  244. sub = id;
  245. }
  246. const repeatedSub = Array.from({ length: minTimes }, () => subIsLiteral ? `"${sub.slice(1, -1).repeat(minTimes)}"` : sub);
  247. const optionalSub = maxTimes !== undefined ? Array.from({ length: maxTimes - minTimes }, () => `${sub}?`) : [`${sub}*`];
  248. seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
  249. }
  250. } else {
  251. let literal = '';
  252. while (i < length) {
  253. if (pattern[i] === '\\' && i < length - 1) {
  254. const next = pattern[i + 1];
  255. if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(next)) {
  256. i += 1;
  257. literal += pattern[i];
  258. i += 1;
  259. } else {
  260. literal += pattern.slice(i, i + 2);
  261. i += 2;
  262. }
  263. } else if (pattern[i] === '"') {
  264. literal += '\\"';
  265. i += 1;
  266. } else if (!NON_LITERAL_SET.has(pattern[i]) &&
  267. (i === length - 1 || literal === '' || pattern[i + 1] === '.' || !NON_LITERAL_SET.has(pattern[i+1]))) {
  268. literal += pattern[i];
  269. i += 1;
  270. } else {
  271. break;
  272. }
  273. }
  274. if (literal !== '') {
  275. seq.push([literal, true]);
  276. }
  277. }
  278. }
  279. return joinSeq();
  280. };
  281. return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
  282. }
  283. _resolveRef(ref) {
  284. let refName = ref.split('/').pop();
  285. if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) {
  286. this._refsBeingResolved.add(ref);
  287. const resolved = this._refs[ref];
  288. refName = this.visit(resolved, refName);
  289. this._refsBeingResolved.delete(ref);
  290. }
  291. return refName;
  292. }
  293. _generateConstantRule(value) {
  294. return this._formatLiteral(JSON.stringify(value));
  295. }
  296. visit(schema, name) {
  297. const schemaType = schema.type;
  298. const schemaFormat = schema.format;
  299. const ruleName = name in RESERVED_NAMES ? name + '-' : name == '' ? 'root' : name;
  300. const ref = schema.$ref;
  301. if (ref !== undefined) {
  302. return this._addRule(ruleName, this._resolveRef(ref));
  303. } else if (schema.oneOf || schema.anyOf) {
  304. return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
  305. } else if (Array.isArray(schemaType)) {
  306. return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
  307. } else if ('const' in schema) {
  308. return this._addRule(ruleName, this._generateConstantRule(schema.const));
  309. } else if ('enum' in schema) {
  310. const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
  311. return this._addRule(ruleName, rule);
  312. } else if ((schemaType === undefined || schemaType === 'object') &&
  313. ('properties' in schema ||
  314. ('additionalProperties' in schema && schema.additionalProperties !== true))) {
  315. const required = new Set(schema.required || []);
  316. const properties = Object.entries(schema.properties ?? {});
  317. return this._addRule(ruleName, this._buildObjectRule(properties, required, name, schema.additionalProperties));
  318. } else if ((schemaType === undefined || schemaType === 'object') && 'allOf' in schema) {
  319. const required = new Set();
  320. const properties = [];
  321. const addComponent = (compSchema, isRequired) => {
  322. const ref = compSchema.$ref;
  323. if (ref !== undefined) {
  324. compSchema = this._refs[ref];
  325. }
  326. if ('properties' in compSchema) {
  327. for (const [propName, propSchema] of Object.entries(compSchema.properties)) {
  328. properties.push([propName, propSchema]);
  329. if (isRequired) {
  330. required.add(propName);
  331. }
  332. }
  333. }
  334. };
  335. for (const t of schema.allOf) {
  336. if ('anyOf' in t) {
  337. for (const tt of t.anyOf) {
  338. addComponent(tt, false);
  339. }
  340. } else {
  341. addComponent(t, true);
  342. }
  343. }
  344. return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false));
  345. } else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
  346. const items = schema.items ?? schema.prefixItems;
  347. if (Array.isArray(items)) {
  348. return this._addRule(
  349. ruleName,
  350. '"[" space ' +
  351. items.map((item, i) => this.visit(item, `${name ?? ''}${name ? '-' : ''}tuple-${i}`)).join(' "," space ') +
  352. ' "]" space'
  353. );
  354. } else {
  355. const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`);
  356. const listItemOperator = `( "," space ${itemRuleName} )`;
  357. let successiveItems = '';
  358. let minItems = schema.minItems || 0;
  359. const maxItems = schema.maxItems;
  360. if (minItems > 0) {
  361. successiveItems = listItemOperator.repeat(minItems - 1);
  362. minItems--;
  363. }
  364. if (maxItems !== undefined && maxItems > minItems) {
  365. successiveItems += `${listItemOperator}?`.repeat(maxItems - minItems - 1);
  366. } else {
  367. successiveItems += `${listItemOperator}*`;
  368. }
  369. const rule = minItems === 0
  370. ? `"[" space ( ${itemRuleName} ${successiveItems} )? "]" space`
  371. : `"[" space ${itemRuleName} ${successiveItems} "]" space`;
  372. return this._addRule(ruleName, rule);
  373. }
  374. } else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) {
  375. return this._visitPattern(schema.pattern, ruleName);
  376. } else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) {
  377. return this._addRule(
  378. ruleName === 'root' ? 'root' : schemaFormat,
  379. PRIMITIVE_RULES['uuid'])
  380. } else if ((schemaType === undefined || schemaType === 'string') && schema.format in DATE_RULES) {
  381. for (const [t, r] of Object.entries(DATE_RULES)) {
  382. this._addRule(t, r);
  383. }
  384. return schemaFormat + '-string';
  385. } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
  386. for (const n of OBJECT_RULE_NAMES) {
  387. this._addRule(n, PRIMITIVE_RULES[n]);
  388. }
  389. return this._addRule(ruleName, 'object');
  390. } else {
  391. if (!(schemaType in PRIMITIVE_RULES)) {
  392. throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
  393. }
  394. // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
  395. return this._addRule(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
  396. }
  397. }
  398. _buildObjectRule(properties, required, name, additionalProperties) {
  399. const propOrder = this._propOrder;
  400. // sort by position in prop_order (if specified) then by original order
  401. const sortedProps = properties.map(([k]) => k).sort((a, b) => {
  402. const orderA = propOrder[a] || Infinity;
  403. const orderB = propOrder[b] || Infinity;
  404. return orderA - orderB || properties.findIndex(([k]) => k === a) - properties.findIndex(([k]) => k === b);
  405. });
  406. const propKvRuleNames = {};
  407. for (const [propName, propSchema] of properties) {
  408. const propRuleName = this.visit(propSchema, `${name ?? ''}${name ? '-' : ''}${propName}`);
  409. propKvRuleNames[propName] = this._addRule(
  410. `${name ?? ''}${name ? '-' : ''}${propName}-kv`,
  411. `${this._formatLiteral(JSON.stringify(propName))} space ":" space ${propRuleName}`
  412. );
  413. }
  414. const requiredProps = sortedProps.filter(k => required.has(k));
  415. const optionalProps = sortedProps.filter(k => !required.has(k));
  416. if (typeof additionalProperties === 'object' || additionalProperties === true) {
  417. const subName = `${name ?? ''}${name ? '-' : ''}additional`;
  418. const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
  419. propKvRuleNames['*'] = this._addRule(
  420. `${subName}-kv`,
  421. `${this._addRule('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
  422. optionalProps.push('*');
  423. }
  424. let rule = '"{" space ';
  425. rule += requiredProps.map(k => propKvRuleNames[k]).join(' "," space ');
  426. if (optionalProps.length > 0) {
  427. rule += ' (';
  428. if (requiredProps.length > 0) {
  429. rule += ' "," space ( ';
  430. }
  431. const getRecursiveRefs = (ks, firstIsOptional) => {
  432. const [k, ...rest] = ks;
  433. const kvRuleName = propKvRuleNames[k];
  434. let res;
  435. if (k === '*') {
  436. res = this._addRule(
  437. `${name ?? ''}${name ? '-' : ''}additional-kvs`,
  438. `${kvRuleName} ( "," space ` + kvRuleName + ` )*`
  439. )
  440. } else if (firstIsOptional) {
  441. res = `( "," space ${kvRuleName} )?`;
  442. } else {
  443. res = kvRuleName;
  444. }
  445. if (rest.length > 0) {
  446. res += ' ' + this._addRule(
  447. `${name ?? ''}${name ? '-' : ''}${k}-rest`,
  448. getRecursiveRefs(rest, true)
  449. );
  450. }
  451. return res;
  452. };
  453. rule += optionalProps.map((_, i) => getRecursiveRefs(optionalProps.slice(i), false)).join(' | ');
  454. if (requiredProps.length > 0) {
  455. rule += ' )';
  456. }
  457. rule += ' )?';
  458. }
  459. rule += ' "}" space';
  460. return rule;
  461. }
  462. formatGrammar() {
  463. let grammar = '';
  464. for (const [name, rule] of Object.entries(this._rules).sort(([a], [b]) => a.localeCompare(b))) {
  465. grammar += `${name} ::= ${rule}\n`;
  466. }
  467. return grammar;
  468. }
  469. }
  470. // Helper function to group elements by a key function
  471. function* groupBy(iterable, keyFn) {
  472. let lastKey = null;
  473. let group = [];
  474. for (const element of iterable) {
  475. const key = keyFn(element);
  476. if (lastKey !== null && key !== lastKey) {
  477. yield [lastKey, group];
  478. group = [];
  479. }
  480. group.push(element);
  481. lastKey = key;
  482. }
  483. if (group.length > 0) {
  484. yield [lastKey, group];
  485. }
  486. }