table-html-restorer.ts 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. /**
  2. * Rehype plugin to restore limited HTML elements inside Markdown table cells.
  3. *
  4. * ## Problem
  5. * The remark/rehype pipeline neutralizes inline HTML as literal text
  6. * (remarkLiteralHtml) so that XML/HTML snippets in LLM responses display
  7. * as-is instead of being rendered. This causes <br> and <ul> markup in
  8. * table cells to show as plain text.
  9. *
  10. * ## Solution
  11. * This plugin traverses the HAST post-conversion, parses whitelisted HTML
  12. * patterns from text nodes, and replaces them with actual HAST element nodes
  13. * that will be rendered as real HTML.
  14. *
  15. * ## Supported HTML
  16. * - `<br>` / `<br/>` / `<br />` - Line breaks (inline)
  17. * - `<ul><li>...</li></ul>` - Unordered lists (block)
  18. *
  19. * ## Key Implementation Details
  20. *
  21. * ### 1. Sibling Combination (Critical)
  22. * The Markdown pipeline may fragment content across multiple text nodes and `<br>`
  23. * elements. For example, `<ul><li>a</li></ul>` might arrive as:
  24. * - Text: `"<ul>"`
  25. * - Element: `<br>`
  26. * - Text: `"<li>a</li></ul>"`
  27. *
  28. * We must combine consecutive text nodes and `<br>` elements into a single string
  29. * before attempting to parse list markup. Without this, list detection fails.
  30. *
  31. * ### 2. visitParents for Deep Traversal
  32. * Table cell content may be wrapped in intermediate elements (e.g., `<p>` tags).
  33. * Using `visitParents` instead of direct child iteration ensures we find text
  34. * nodes at any depth within the cell.
  35. *
  36. * ### 3. Reference Comparison for No-Op Detection
  37. * When checking if `<br>` expansion changed anything, we compare:
  38. * `expanded.length !== 1 || expanded[0] !== textNode`
  39. *
  40. * This catches both cases:
  41. * - Multiple nodes created (text was split)
  42. * - Single NEW node created (original had only `<br>`, now it's an element)
  43. *
  44. * A simple `length > 1` check would miss the single `<br>` case.
  45. *
  46. * ### 4. Strict List Validation
  47. * `parseList()` rejects malformed markup by checking for garbage text between
  48. * `<li>` elements. This prevents creating broken DOM from partial matches like
  49. * `<ul>garbage<li>a</li></ul>`.
  50. *
  51. * ### 5. Newline Substitution for `<br>` in Combined String
  52. * When combining siblings, existing `<br>` elements become `\n` in the combined
  53. * string. This allows list content to span visual lines while still being parsed
  54. * as a single unit.
  55. *
  56. * @example
  57. * // Input Markdown:
  58. * // | Feature | Notes |
  59. * // |---------|-------|
  60. * // | Multi-line | First<br>Second |
  61. * // | List | <ul><li>A</li><li>B</li></ul> |
  62. * //
  63. * // Without this plugin: <br> and <ul> render as literal text
  64. * // With this plugin: <br> becomes line break, <ul> becomes actual list
  65. */
  66. import type { Plugin } from 'unified';
  67. import type { Element, ElementContent, Root, Text } from 'hast';
  68. import { visit } from 'unist-util-visit';
  69. import { visitParents } from 'unist-util-visit-parents';
  70. import { BR_PATTERN, LIST_PATTERN, LI_PATTERN } from '$lib/constants/table-html-restorer';
  /**
   * Splits a string on every `BR_PATTERN` match, producing the text fragments
   * interleaved with `<br>` element nodes.
   *
   * Empty fragments (before a leading match, between adjacent matches, or after
   * a trailing match) are omitted. When the pattern does not match at all, the
   * result is a single text node holding the untouched input, even if the input
   * is the empty string.
   *
   * @param value - Raw text that may contain literal `<br>` markup.
   * @returns Freshly created HAST nodes, in document order.
   */
  function expandBrTags(value: string): ElementContent[] {
    const makeText = (chunk: string) => ({ type: 'text', value: chunk }) as Text;
    const makeBreak = () =>
      ({ type: 'element', tagName: 'br', properties: {}, children: [] }) as Element;

    const nodes: ElementContent[] = [];
    let consumed = 0;
    let sawBreak = false;

    for (const hit of value.matchAll(BR_PATTERN)) {
      sawBreak = true;
      const hitStart = hit.index!;
      // Keep the text that sits between the previous tag and this one.
      if (hitStart > consumed) {
        nodes.push(makeText(value.slice(consumed, hitStart)));
      }
      nodes.push(makeBreak());
      consumed = hitStart + hit[0].length;
    }

    // Nothing matched: hand back the whole input as one text node.
    if (!sawBreak) return [makeText(value)];

    // Keep whatever follows the final tag.
    if (consumed < value.length) {
      nodes.push(makeText(value.slice(consumed)));
    }
    return nodes;
  }
  /**
   * Converts literal `<ul><li>...</li></ul>` markup into a HAST `ul` element.
   *
   * The trimmed input must match `LIST_PATTERN`; its first capture group is
   * treated as the list body. Every `LI_PATTERN` match in that body becomes an
   * `li` element whose children are the item's first capture group run through
   * `expandBrTags`.
   *
   * @param value - Candidate list markup.
   * @returns The `ul` element, or `null` when the input does not match
   *   `LIST_PATTERN`, when anything other than whitespace sits before, between
   *   or after the items, or when the body contains no items at all.
   */
  function parseList(value: string): Element | null {
    const listMatch = value.trim().match(LIST_PATTERN);
    if (listMatch === null) return null;

    const inner = listMatch[1];
    const listItems: ElementContent[] = [];
    let consumed = 0;

    for (const item of inner.matchAll(LI_PATTERN)) {
      const itemStart = item.index!;
      // Only whitespace may separate one item from the next.
      const gap = inner.slice(consumed, itemStart);
      if (gap.trim().length > 0) return null;

      const itemNode = {
        type: 'element',
        tagName: 'li',
        properties: {},
        children: expandBrTags(item[1] ?? '')
      } as Element;
      listItems.push(itemNode);
      consumed = itemStart + item[0].length;
    }

    // An empty list or leftover non-whitespace after the last item is malformed.
    const hasTrailingGarbage = inner.slice(consumed).trim().length > 0;
    if (listItems.length === 0 || hasTrailingGarbage) return null;

    return { type: 'element', tagName: 'ul', properties: {}, children: listItems } as Element;
  }
  /**
   * Restores whitelisted HTML (`<ul>` lists and `<br>` line breaks) that shows
   * up as literal text inside one table cell. The cell is mutated in place.
   *
   * For each text node found at any depth below `cell`:
   * 1. The node and the consecutive text / `<br>` siblings that follow it are
   *    joined into one string; each `<br>` element contributes a `\n`.
   * 2. If `parseList` accepts that string, the whole sibling run is replaced by
   *    the returned `ul` element.
   * 3. Otherwise only the visited text node has its `<br>` markup expanded via
   *    `expandBrTags`. A node without any `<br>` markup is left untouched.
   *
   * @param cell - The table cell element whose descendants are rewritten.
   */
  function processCell(cell: Element): void {
    visitParents(cell, 'text', (textNode: Text, ancestors) => {
      const parent = ancestors[ancestors.length - 1];
      if (!parent || parent.type !== 'element') return;

      const siblings = (parent as Element).children as ElementContent[];
      const startIndex = siblings.indexOf(textNode as ElementContent);
      if (startIndex === -1) return;

      // Join this node with the text / <br> siblings that directly follow it, so
      // list markup fragmented across several nodes can be parsed as one unit.
      let combined = '';
      let endIndex = startIndex;
      for (let i = startIndex; i < siblings.length; i++) {
        const sibling = siblings[i];
        if (sibling.type === 'text') {
          combined += sibling.value;
        } else if (sibling.type === 'element' && sibling.tagName === 'br') {
          combined += '\n';
        } else {
          // Any other node ends the run.
          break;
        }
        endIndex = i;
      }

      // A complete list replaces the entire combined run.
      const list = parseList(combined);
      if (list) {
        siblings.splice(startIndex, endIndex - startIndex + 1, list);
        return;
      }

      // Otherwise expand <br> markup in this text node only.
      const expanded = expandBrTags(textNode.value);

      // expandBrTags always allocates new nodes, so comparing the result with
      // `textNode` by reference can never report "unchanged". Compare by content
      // instead: a lone text node carrying the same value means no <br> was found,
      // and the original node (with its `position` / `data`) must stay in place.
      // A lone <br> element still counts as a change.
      const only = expanded.length === 1 ? expanded[0] : undefined;
      const unchanged =
        only !== undefined && only.type === 'text' && only.value === textNode.value;
      if (!unchanged) {
        siblings.splice(startIndex, 1, ...expanded);
      }
    });
  }
  /**
   * Rehype plugin factory. The returned transformer walks every element in the
   * tree and passes each `td` / `th` element to `processCell`, which rewrites
   * the cell's contents in place.
   */
  export const rehypeRestoreTableHtml: Plugin<[], Root> = () => {
    return (tree) => {
      visit(tree, 'element', (element: Element) => {
        const isTableCell = element.tagName === 'td' || element.tagName === 'th';
        if (!isTableCell) return;
        processCell(element);
      });
    };
  };