# json-schema-to-grammar.py
  1. import argparse
  2. import json
  3. import re
  4. import sys
  5. # whitespace is constrained to a single space char to prevent model "running away" in
  6. # whitespace. Also maybe improves generation quality?
  7. SPACE_RULE = '" "?'
  8. PRIMITIVE_RULES = {
  9. 'boolean': '("true" | "false") space',
  10. 'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
  11. 'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
  12. 'string': r''' "\"" (
  13. [^"\\] |
  14. "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
  15. )* "\"" space ''',
  16. 'null': '"null" space',
  17. }
  18. INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
  19. GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
  20. GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}
  21. class SchemaConverter:
  22. def __init__(self, prop_order):
  23. self._prop_order = prop_order
  24. self._rules = {'space': SPACE_RULE}
  25. def _format_literal(self, literal):
  26. escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
  27. lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
  28. )
  29. return f'"{escaped}"'
  30. def _add_rule(self, name, rule):
  31. esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
  32. if esc_name not in self._rules or self._rules[esc_name] == rule:
  33. key = esc_name
  34. else:
  35. i = 0
  36. while f'{esc_name}{i}' in self._rules:
  37. i += 1
  38. key = f'{esc_name}{i}'
  39. self._rules[key] = rule
  40. return key
  41. def visit(self, schema, name):
  42. schema_type = schema.get('type')
  43. rule_name = name or 'root'
  44. if 'oneOf' in schema or 'anyOf' in schema:
  45. rule = ' | '.join((
  46. self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
  47. for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
  48. ))
  49. return self._add_rule(rule_name, rule)
  50. elif 'const' in schema:
  51. return self._add_rule(rule_name, self._format_literal(schema['const']))
  52. elif 'enum' in schema:
  53. rule = ' | '.join((self._format_literal(v) for v in schema['enum']))
  54. return self._add_rule(rule_name, rule)
  55. elif schema_type == 'object' and 'properties' in schema:
  56. # TODO: `required` keyword
  57. prop_order = self._prop_order
  58. prop_pairs = sorted(
  59. schema['properties'].items(),
  60. # sort by position in prop_order (if specified) then by key
  61. key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
  62. )
  63. rule = '"{" space'
  64. for i, (prop_name, prop_schema) in enumerate(prop_pairs):
  65. prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
  66. if i > 0:
  67. rule += ' "," space'
  68. rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
  69. rule += ' "}" space'
  70. return self._add_rule(rule_name, rule)
  71. elif schema_type == 'array' and 'items' in schema:
  72. # TODO `prefixItems` keyword
  73. item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
  74. rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
  75. return self._add_rule(rule_name, rule)
  76. else:
  77. assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
  78. return self._add_rule(
  79. 'root' if rule_name == 'root' else schema_type,
  80. PRIMITIVE_RULES[schema_type]
  81. )
  82. def format_grammar(self):
  83. return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items()))
  84. def main(args_in = None):
  85. parser = argparse.ArgumentParser(
  86. description='''
  87. Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
  88. given JSON schema. Only a subset of JSON schema features are supported; more may be
  89. added in the future.
  90. ''',
  91. )
  92. parser.add_argument(
  93. '--prop-order',
  94. default=[],
  95. type=lambda s: s.split(','),
  96. help='''
  97. comma-separated property names defining the order of precedence for object properties;
  98. properties not specified here are given lower precedence than those that are, and are
  99. sorted alphabetically
  100. '''
  101. )
  102. parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
  103. args = parser.parse_args(args_in)
  104. schema = json.load(sys.stdin if args.schema == '-' else open(args.schema))
  105. prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
  106. converter = SchemaConverter(prop_order)
  107. converter.visit(schema, '')
  108. print(converter.format_grammar())
  109. if __name__ == '__main__':
  110. main()