// lexer.cpp

#include "lexer.h"
#include "runtime.h"

#include <cctype>
#include <functional>
#include <map>
#include <string>
#include <vector>

#define FILENAME "jinja-lexer"

namespace jinja {

static void string_lstrip(std::string & s, const char * chars) {
    size_t start = s.find_first_not_of(chars);
    if (start == std::string::npos) {
        s.clear();
    } else {
        s.erase(0, start);
    }
}

static void string_rstrip(std::string & s, const char * chars) {
    size_t end = s.find_last_not_of(chars);
    if (end == std::string::npos) {
        s.clear();
    } else {
        s.erase(end + 1);
    }
}
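
// Example (illustrative): string_lstrip(s, " \t\r\n") turns "  \thello " into "hello ", while
// string_rstrip on the same input yields "  \thello"; an all-whitespace string is cleared.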

lexer_result lexer::tokenize(const std::string & source) {
    std::vector<token> tokens;

    // NOTE: do NOT transform the source string (i.e. no preprocessing), as we need to keep
    // the original character positions for error reporting etc.
    std::string src = source;
    if (source.empty()) {
        return {tokens, src};
    }

    // Normalize \r\n or \r to \n
    for (std::string::size_type pos = 0; (pos = src.find("\r\n", pos)) != std::string::npos; ) {
        src.erase(pos, 1);
        ++pos;
    }
    for (std::string::size_type pos = 0; (pos = src.find("\r", pos)) != std::string::npos; ) {
        src.replace(pos, 1, 1, '\n');
        ++pos;
    }
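
    // Example (illustrative): "a\r\nb\rc" becomes "a\nb\nc" after the two loops above.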

    // In the default configuration:
    // - a single trailing newline is stripped if present
    // - other whitespace (spaces, tabs, newlines, etc.) is returned unchanged
    if (!src.empty() && src.back() == '\n') {
        src.pop_back();
    }

    size_t pos = 0;
    size_t start_pos = 0;
    size_t curly_bracket_depth = 0;

    using pred = std::function<bool(char)>;

    auto consume_while = [&](const pred & predicate) -> std::string {
        std::string str;
        while (predicate(src[pos])) {
            // check for escape char
            if (src[pos] == '\\') {
                // consume backslash
                ++pos;
                // check for end of input
                if (pos >= src.size()) {
                    throw lexer_exception("unexpected end of input after escape character", source, pos);
                }
                // add escaped char
                char escaped_char = src[pos++];
                if (escape_chars.find(escaped_char) == escape_chars.end()) {
                    throw lexer_exception(std::string("unknown escape character \\") + escaped_char, source, pos);
                }
                char unescaped_char = escape_chars.at(escaped_char);
                str += unescaped_char;
                continue;
            }
            str += src[pos++];
            if (pos > src.size()) {
                throw lexer_exception("unexpected end of input during consume_while", source, pos);
            }
        }
        return str;
    };
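
    // Note: consume_while collects characters for as long as the predicate holds, translating
    // backslash escapes via escape_chars along the way. For example, assuming is_word accepts
    // identifier characters, consume_while(is_word) starting at "foo_bar(" returns "foo_bar"
    // and leaves pos on the '('.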

    auto consume_numeric = [&]() -> std::string {
        std::string num = consume_while(is_integer);
        if (pos < src.size() && src[pos] == '.' && pos + 1 < src.size() && is_integer(src[pos + 1])) {
            ++pos; // Consume '.'
            std::string frac = consume_while(is_integer);
            num += "." + frac;
        }
        return num;
    };
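
    // Example (illustrative): "3.14)" yields "3.14", while "3." yields just "3" because the
    // '.' is only consumed when a digit follows it.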

    auto next_pos_is = [&](std::initializer_list<char> chars, size_t n = 1) -> bool {
        if (pos + n >= src.size()) return false;
        for (char c : chars) {
            if (src[pos + n] == c) return true;
        }
        return false;
    };

    // note: default config for chat templates: lstrip_blocks = true, trim_blocks = true
    // text\n[space]{block} --> text\n{block}
    bool opt_lstrip_blocks = true;
    // {block}\n[space]text --> {block}[space]text
    bool opt_trim_blocks = true;

    // options set dynamically based on the current/last block
    bool is_lstrip_block = false; // example: {%-
    bool is_rstrip_block = false; // example: -%}
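
    // Example (illustrative) of the two options combined, for the template
    // "a\n  {% if x %}\nb{% endif %}": lstrip_blocks drops the two spaces before "{% if",
    // and trim_blocks drops the newline right after the first "%}".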

    while (pos < src.size()) {
        start_pos = pos;
        // JJ_DEBUG("lexer main loop at pos %zu: '%s...'", pos, src.substr(pos, 10).c_str());

        // First, consume all text that is outside of a Jinja statement or expression
        token::type last_token_type = tokens.empty()
            ? token::close_statement // initial state
            : tokens.back().t;
        if (last_token_type == token::close_statement ||
            last_token_type == token::close_expression ||
            last_token_type == token::comment) {
            bool last_block_can_rm_newline = false;
            is_rstrip_block = false;
            if (pos > 3) {
                char c0 = src[pos - 3];
                char c1 = src[pos - 2];
                char c2 = src[pos - 1];
                // strip if: -[%}#]}text
                is_rstrip_block = c0 == '-'
                    && (c1 == '%' || c1 == '}' || c1 == '#')
                    && c2 == '}';
                // match behavior of hf.js: exclude {{ and }} cases, regex: ([#%-]})
                last_block_can_rm_newline = (c1 == '#' || c1 == '%' || c1 == '-') && c2 == '}';
            }
            size_t start = pos;
            size_t end = start;
            // Keep going until we hit the next Jinja statement or expression
            while (pos < src.size() &&
                   !(src[pos] == '{' && next_pos_is({'%', '{', '#'}))) {
                end = ++pos;
            }
            // equivalent to hf.js code: template.replace(/^[ \t]*({[#%-])/gm, "$1");
            if (opt_lstrip_blocks && src[pos] == '{' && next_pos_is({'%', '#', '-'})) {
                size_t current = end;
                while (current > start) {
                    char c = src[current - 1];
                    if (c == '\n') {
                        end = current; // Trim from the start of the line
                        break;
                    }
                    if (!std::isspace(static_cast<unsigned char>(c))) {
                        break; // Found non-whitespace before a newline, keep the text as-is
                    }
                    if (current == 1) {
                        end = 0; // Only whitespace up to the start of the string; trim it all
                        break;
                    }
                    --current;
                }
            }
            std::string text = src.substr(start, end - start);
            // equivalent to hf.js code: template.replace(/([#%-]})\n/g, "$1");
            if (opt_trim_blocks && last_block_can_rm_newline) {
                if (!text.empty() && text.front() == '\n') {
                    text.erase(text.begin());
                }
            }
            if (is_rstrip_block) {
                // example: {last_block}[space]text
                // doing lstrip on the text effectively rstrips the LAST block
                // JJ_DEBUG("RSTRIP block detected, current text: '%s'", text.c_str());
                string_lstrip(text, " \t\r\n");
            }
            is_lstrip_block = src[pos] == '{' && next_pos_is({'{', '%', '#'}) && next_pos_is({'-'}, 2);
            if (is_lstrip_block) {
                // example: text[space]{current_block}
                // doing rstrip on the text effectively lstrips the CURRENT block
                // JJ_DEBUG("LSTRIP block detected, current text: '%s'", text.c_str());
                string_rstrip(text, " \t\r\n");
            }
            if (!text.empty()) {
                // JJ_DEBUG("consumed text: '%s'", text.c_str());
                tokens.push_back({token::text, text, start_pos});
                continue;
            }
        }
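
        // Example (illustrative): for "a   {%- if x -%}   b", the "{%-" causes the trailing
        // spaces of "a   " to be rstripped here, and on the pass after "-%}" the leading
        // spaces of "   b" are lstripped.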

        // Possibly consume a comment
        // TODO: handle lstrip/rstrip for comments? (not important for now)
        if (src[pos] == '{' && next_pos_is({'#'})) {
            start_pos = pos;
            pos += 2; // Skip the opening {#
            std::string comment;
            while (!(src[pos] == '#' && next_pos_is({'}'}))) {
                if (pos + 2 >= src.size()) {
                    throw lexer_exception("missing end of comment tag", source, pos);
                }
                comment += src[pos++];
            }
            JJ_DEBUG("consumed comment: '%s'", comment.c_str());
            tokens.push_back({token::comment, comment, start_pos});
            pos += 2; // Skip the closing #}
            continue;
        }
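
        // Example (illustrative): "{# greet the user #}" produces a single comment token
        // whose value is " greet the user " (surrounding whitespace is kept as-is).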

        if (src[pos] == '-' && (last_token_type == token::open_expression ||
                                last_token_type == token::open_statement)) {
            JJ_DEBUG("lexer main loop at pos %zu: '%s...'", pos, src.substr(pos, 10).c_str());
            pos++; // consume '-' in {%- or {{-
            if (pos >= src.size()) break;
        }

        // Consume (and ignore) all whitespace inside Jinja statements or expressions
        consume_while([](char c) { return std::isspace(static_cast<unsigned char>(c)); });
        if (pos >= src.size()) break;

        char ch = src[pos];
        bool is_closing_block = ch == '-' && next_pos_is({'%', '}'});

        // Check for unary operators
        if (!is_closing_block && (ch == '-' || ch == '+')) {
            start_pos = pos;
            token::type last_token_type = tokens.empty() ? token::eof : tokens.back().t;
            if (last_token_type == token::text || last_token_type == token::eof) {
                throw lexer_exception(std::string("unexpected character: ") + ch, source, pos);
            }
            switch (last_token_type) {
                case token::identifier:
                case token::numeric_literal:
                case token::string_literal:
                case token::close_paren:
                case token::close_square_bracket:
                    // Part of a binary operator
                    // a - 1, 1 - 1, true - 1, "apple" - 1, (1) - 1, a[1] - 1
                    // Continue parsing normally
                    break;
                default: {
                    // Is part of a unary operator
                    // (-1), [-1], (1 + -1), not -1, -apple
                    ++pos; // Consume the operator
                    // Check for numbers following the unary operator
                    std::string num = consume_numeric();
                    std::string value = std::string(1, ch) + num;
                    token::type t = num.empty() ? token::unary_operator : token::numeric_literal;
                    // JJ_DEBUG("consumed unary operator or numeric literal: '%s'", value.c_str());
                    tokens.push_back({t, value, start_pos});
                    continue;
                }
            }
        }
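
        // Example (illustrative): in "{{ -1 + a }}" the "-1" is emitted as a single
        // numeric_literal, while in "{{ a - 1 }}" the previous token is an identifier, so the
        // '-' falls through to the mapping table below (assuming '-' has an entry there) and
        // is emitted as a binary operator token.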

        // Try to match one of the tokens in the mapping table
        bool matched = false;
        for (const auto & [seq, typ] : ordered_mapping_table) {
            start_pos = pos;
            // Inside an object literal, don't treat "}}" as expression-end
            if (seq == "}}" && curly_bracket_depth > 0) {
                continue;
            }
            if (pos + seq.size() <= src.size() && src.substr(pos, seq.size()) == seq) {
                tokens.push_back({typ, seq, start_pos});
                if (typ == token::open_expression) {
                    curly_bracket_depth = 0;
                } else if (typ == token::open_curly_bracket) {
                    ++curly_bracket_depth;
                } else if (typ == token::close_curly_bracket) {
                    --curly_bracket_depth;
                }
                pos += seq.size();
                matched = true;
                break; // continue main loop
            }
        }
        if (matched) continue; // continue main loop
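
        // Note: the first match wins, so this presumably relies on ordered_mapping_table
        // listing longer sequences before their prefixes (e.g. "==" before "="). The
        // curly_bracket_depth counter keeps the "}}" formed by nested dict braces in e.g.
        // {{ {"a": {"b": 1}} }} from being mistaken for the end of the expression.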

        // Strings
        if (ch == '\'' || ch == '"') {
            start_pos = pos;
            ++pos; // Skip opening quote
            std::string str = consume_while([ch](char c) { return c != ch; });
            // JJ_DEBUG("consumed string literal: '%s'", str.c_str());
            tokens.push_back({token::string_literal, str, start_pos});
            ++pos; // Skip closing quote
            continue;
        }
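
        // Note: the token value holds the unquoted (and unescaped) contents, so both 'hi' and
        // "hi" yield a string_literal with value hi; an unterminated string surfaces as the
        // "unexpected end of input" error thrown inside consume_while.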

        // Numbers
        if (is_integer(ch)) {
            start_pos = pos;
            std::string num = consume_numeric();
            // JJ_DEBUG("consumed numeric literal: '%s'", num.c_str());
            tokens.push_back({token::numeric_literal, num, start_pos});
            continue;
        }

        // Identifiers
        if (is_word(ch)) {
            start_pos = pos;
            std::string word = consume_while(is_word);
            // JJ_DEBUG("consumed identifier: '%s'", word.c_str());
            tokens.push_back({token::identifier, word, start_pos});
            continue;
        }

        throw lexer_exception(std::string("unexpected character: ") + ch, source, pos);
    }
    return {std::move(tokens), src};
}

} // namespace jinja
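
// Usage sketch (illustrative): the exact shape of lexer_result and token, and the lexer
// constructor, live in lexer.h; this assumes a default-constructible lexer.
//
//   jinja::lexer lex;
//   jinja::lexer_result res = lex.tokenize("Hello, {{ name }}!\n");
//   // expected token stream: text "Hello, ", open_expression "{{",
//   //                        identifier "name", close_expression "}}", text "!"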