json-schema-to-grammar.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. #!/usr/bin/env python3
  2. import argparse
  3. import json
  4. import re
  5. import sys
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
# This is the body of the `space` rule that SchemaConverter registers on construction.
SPACE_RULE = '" "?'

# Grammar rule bodies for the primitive schema types, keyed by the schema's `type` value.
# Each body ends with a reference to the `space` rule.
PRIMITIVE_RULES = {
    'boolean': '("true" | "false") space',
    # optional sign, integer part without leading zeros, optional fraction, optional exponent
    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
    # a double-quoted string: any char except quote/backslash, or a backslash escape
    # (one of "\/bfnrt, or `u` followed by exactly four hex digits)
    'string': r''' "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space ''',
    'null': '"null" space',
}

# Matches every run of characters other than ASCII letters, digits and `-`;
# SchemaConverter._add_rule replaces each such run in a rule name with a single `-`.
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')

# Characters that must be backslash-escaped inside a quoted grammar literal,
# and the escape sequence emitted for each of them.
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}
  22. class SchemaConverter:
  23. def __init__(self, prop_order):
  24. self._prop_order = prop_order
  25. self._rules = {'space': SPACE_RULE}
  26. def _format_literal(self, literal):
  27. escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
  28. lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
  29. )
  30. return f'"{escaped}"'
  31. def _add_rule(self, name, rule):
  32. esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
  33. if esc_name not in self._rules or self._rules[esc_name] == rule:
  34. key = esc_name
  35. else:
  36. i = 0
  37. while f'{esc_name}{i}' in self._rules:
  38. i += 1
  39. key = f'{esc_name}{i}'
  40. self._rules[key] = rule
  41. return key
  42. def visit(self, schema, name):
  43. schema_type = schema.get('type')
  44. rule_name = name or 'root'
  45. if 'oneOf' in schema or 'anyOf' in schema:
  46. rule = ' | '.join((
  47. self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
  48. for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
  49. ))
  50. return self._add_rule(rule_name, rule)
  51. elif 'const' in schema:
  52. return self._add_rule(rule_name, self._format_literal(schema['const']))
  53. elif 'enum' in schema:
  54. rule = ' | '.join((self._format_literal(v) for v in schema['enum']))
  55. return self._add_rule(rule_name, rule)
  56. elif schema_type == 'object' and 'properties' in schema:
  57. # TODO: `required` keyword
  58. prop_order = self._prop_order
  59. prop_pairs = sorted(
  60. schema['properties'].items(),
  61. # sort by position in prop_order (if specified) then by key
  62. key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
  63. )
  64. rule = '"{" space'
  65. for i, (prop_name, prop_schema) in enumerate(prop_pairs):
  66. prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
  67. if i > 0:
  68. rule += ' "," space'
  69. rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
  70. rule += ' "}" space'
  71. return self._add_rule(rule_name, rule)
  72. elif schema_type == 'array' and 'items' in schema:
  73. # TODO `prefixItems` keyword
  74. item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
  75. list_item_operator = f'("," space {item_rule_name})'
  76. successive_items = ""
  77. min_items = schema.get("minItems", 0)
  78. if min_items > 0:
  79. first_item = f"({item_rule_name})"
  80. successive_items = list_item_operator * (min_items - 1)
  81. min_items -= 1
  82. else:
  83. first_item = f"({item_rule_name})?"
  84. max_items = schema.get("maxItems")
  85. if max_items is not None and max_items > min_items:
  86. successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
  87. else:
  88. successive_items += list_item_operator + "*"
  89. rule = f'"[" space {first_item} {successive_items} "]" space'
  90. return self._add_rule(rule_name, rule)
  91. else:
  92. assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
  93. return self._add_rule(
  94. 'root' if rule_name == 'root' else schema_type,
  95. PRIMITIVE_RULES[schema_type]
  96. )
  97. def format_grammar(self):
  98. return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items()))
  99. def main(args_in = None):
  100. parser = argparse.ArgumentParser(
  101. description='''
  102. Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
  103. given JSON schema. Only a subset of JSON schema features are supported; more may be
  104. added in the future.
  105. ''',
  106. )
  107. parser.add_argument(
  108. '--prop-order',
  109. default=[],
  110. type=lambda s: s.split(','),
  111. help='''
  112. comma-separated property names defining the order of precedence for object properties;
  113. properties not specified here are given lower precedence than those that are, and are
  114. sorted alphabetically
  115. '''
  116. )
  117. parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
  118. args = parser.parse_args(args_in)
  119. schema = json.load(sys.stdin if args.schema == '-' else open(args.schema))
  120. prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
  121. converter = SchemaConverter(prop_order)
  122. converter.visit(schema, '')
  123. print(converter.format_grammar())
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()