// lexer.h - token definitions and lexer interface for the jinja namespace
  1. #pragma once
  2. #include "utils.h"
  3. #include <cctype>
  4. #include <map>
  5. #include <stdexcept>
  6. #include <string>
  7. #include <vector>
  8. namespace jinja {
  // A single lexical unit: its category, its associated text and a position.
  //
  // Remains an aggregate (C++14 and later), so `token{type, value, pos}` keeps
  // working. The default member initializers guarantee that a
  // default-constructed token never carries indeterminate `t` / `pos` values.
  struct token {
      // Category of a token. Enumerator order defines the numeric values;
      // append new kinds at the end to keep existing values stable.
      enum type {
          eof,                            // end of source
          text,                           // The text between Jinja statements or expressions
          numeric_literal,                // e.g., 123, 1.0
          string_literal,                 // 'string'
          identifier,                     // Variables, functions, statements, booleans, etc.
          equals,                         // =
          open_paren,                     // (
          close_paren,                    // )
          open_statement,                 // {%
          close_statement,                // %}
          open_expression,                // {{
          close_expression,               // }}
          open_square_bracket,            // [
          close_square_bracket,           // ]
          open_curly_bracket,             // {
          close_curly_bracket,            // }
          comma,                          // ,
          dot,                            // .
          colon,                          // :
          pipe,                           // |
          call_operator,                  // ()
          additive_binary_operator,       // + - ~
          multiplicative_binary_operator, // * / %
          comparison_binary_operator,     // < > <= >= == !=
          unary_operator,                 // ! - +
          comment,                        // {# ... #}
      };

      type t = eof;       // category of this token; defaults to eof
      std::string value;  // text carried by the token; empty by default
      size_t pos = 0;     // position of the token - presumably an offset into the
                          // lexed source; confirm against the tokenize() implementation
  };
  42. static std::string type_to_string(token::type t) {
  43. switch (t) {
  44. case token::eof: return "eof";
  45. case token::text: return "text";
  46. case token::numeric_literal: return "numeric_literal";
  47. case token::string_literal: return "string_literal";
  48. case token::identifier: return "identifier";
  49. case token::equals: return "equals";
  50. case token::open_paren: return "open_paren";
  51. case token::close_paren: return "close_paren";
  52. case token::open_statement: return "open_statement";
  53. case token::close_statement: return "close_statement";
  54. case token::open_expression: return "open_expression";
  55. case token::close_expression: return "close_expression";
  56. case token::open_square_bracket: return "open_square_bracket";
  57. case token::close_square_bracket: return "close_square_bracket";
  58. case token::open_curly_bracket: return "open_curly_bracket";
  59. case token::close_curly_bracket: return "close_curly_bracket";
  60. case token::comma: return "comma";
  61. case token::dot: return "dot";
  62. case token::colon: return "colon";
  63. case token::pipe: return "pipe";
  64. case token::call_operator: return "call_operator";
  65. case token::additive_binary_operator: return "additive_binary_operator";
  66. case token::multiplicative_binary_operator: return "multiplicative_binary_operator";
  67. case token::comparison_binary_operator: return "comparison_binary_operator";
  68. case token::unary_operator: return "unary_operator";
  69. case token::comment: return "comment";
  70. default: return "unknown";
  71. }
  72. }
  73. struct lexer_result {
  74. std::vector<token> tokens;
  75. std::string source;
  76. };
  77. struct lexer {
  78. const std::map<char, char> escape_chars = {
  79. {'n', '\n'},
  80. {'t', '\t'},
  81. {'r', '\r'},
  82. {'b', '\b'},
  83. {'f', '\f'},
  84. {'v', '\v'},
  85. {'\\', '\\'},
  86. {'\'', '\''},
  87. {'\"', '\"'},
  88. };
  89. static bool is_word(char c) {
  90. return std::isalnum(static_cast<unsigned char>(c)) || c == '_';
  91. }
  92. static bool is_integer(char c) {
  93. return std::isdigit(static_cast<unsigned char>(c));
  94. }
  95. const std::vector<std::pair<std::string, token::type>> ordered_mapping_table = {
  96. // Trimmed control sequences
  97. {"{%-", token::open_statement},
  98. {"-%}", token::close_statement},
  99. {"{{-", token::open_expression},
  100. {"-}}", token::close_expression},
  101. // Control sequences
  102. {"{%", token::open_statement},
  103. {"%}", token::close_statement},
  104. {"{{", token::open_expression},
  105. {"}}", token::close_expression},
  106. // Single character tokens
  107. {"(", token::open_paren},
  108. {")", token::close_paren},
  109. {"{", token::open_curly_bracket},
  110. {"}", token::close_curly_bracket},
  111. {"[", token::open_square_bracket},
  112. {"]", token::close_square_bracket},
  113. {",", token::comma},
  114. {".", token::dot},
  115. {":", token::colon},
  116. {"|", token::pipe},
  117. // Comparison operators
  118. {"<=", token::comparison_binary_operator},
  119. {">=", token::comparison_binary_operator},
  120. {"==", token::comparison_binary_operator},
  121. {"!=", token::comparison_binary_operator},
  122. {"<", token::comparison_binary_operator},
  123. {">", token::comparison_binary_operator},
  124. // Arithmetic operators
  125. {"+", token::additive_binary_operator},
  126. {"-", token::additive_binary_operator},
  127. {"~", token::additive_binary_operator},
  128. {"*", token::multiplicative_binary_operator},
  129. {"/", token::multiplicative_binary_operator},
  130. {"%", token::multiplicative_binary_operator},
  131. // Assignment operator
  132. {"=", token::equals},
  133. };
  134. // tokenize the source string into a list of tokens
  135. // may throw lexer_exception on error
  136. lexer_result tokenize(const std::string & source);
  137. };
  138. struct lexer_exception : public std::runtime_error {
  139. lexer_exception(const std::string & msg, const std::string & source, size_t pos)
  140. : std::runtime_error(fmt_error_with_source("lexer", msg, source, pos)) {}
  141. };
  142. } // namespace jinja