Add basic HTML parser for converting strings to React components (#36071)

This commit is contained in:
Echo 2025-09-11 11:22:44 +02:00 committed by GitHub
parent 2314583606
commit 8a0d0025ff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 347 additions and 0 deletions

View File

@ -0,0 +1,69 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
exports[`html > htmlStringToComponents > copies attributes to props 1`] = `
[
<a
href="https://example.com"
rel="nofollow"
target="_blank"
>
link
</a>,
]
`;
exports[`html > htmlStringToComponents > handles nested elements 1`] = `
[
<p>
lorem
<strong>
ipsum
</strong>
</p>,
]
`;
exports[`html > htmlStringToComponents > ignores empty text nodes 1`] = `
[
<p>
<span>
lorem ipsum
</span>
</p>,
]
`;
exports[`html > htmlStringToComponents > respects allowedTags option 1`] = `
[
<p>
lorem
<em>
dolor
</em>
</p>,
]
`;
exports[`html > htmlStringToComponents > respects maxDepth option 1`] = `
[
<p>
<span />
</p>,
]
`;
exports[`html > htmlStringToComponents > returns converted nodes from string 1`] = `
[
<p>
lorem ipsum
</p>,
]
`;
exports[`html > htmlStringToComponents > uses default parsing if onElement returns undefined 1`] = `
[
<p>
lorem ipsum
</p>,
]
`;

View File

@ -1,3 +1,5 @@
import React from 'react';
import * as html from '../html';
describe('html', () => {
@ -9,4 +11,104 @@ describe('html', () => {
expect(output).toEqual('lorem\n\nipsum\n<br>');
});
});
// Tests for `htmlStringToComponents`: each case parses a small HTML string and
// checks either the returned array of React nodes (via snapshot) or the
// arguments passed to the optional callbacks.
describe('htmlStringToComponents', () => {
it('returns converted nodes from string', () => {
const input = '<p>lorem ipsum</p>';
const output = html.htmlStringToComponents(input);
expect(output).toMatchSnapshot();
});
it('handles nested elements', () => {
const input = '<p>lorem <strong>ipsum</strong></p>';
const output = html.htmlStringToComponents(input);
expect(output).toMatchSnapshot();
});
// The whitespace-only text nodes on either side of the <span> must not show
// up in the output.
it('ignores empty text nodes', () => {
const input = '<p> <span>lorem ipsum</span> </p>';
const output = html.htmlStringToComponents(input);
expect(output).toMatchSnapshot();
});
it('copies attributes to props', () => {
const input =
'<a href="https://example.com" target="_blank" rel="nofollow">link</a>';
const output = html.htmlStringToComponents(input);
expect(output).toMatchSnapshot();
});
// The parsed fragment counts as depth 0, so <p> is depth 1 and <span> is
// depth 2. Everything inside the <span> is depth 3 and gets dropped, leaving
// an empty <span />.
it('respects maxDepth option', () => {
const input = '<p><span>lorem <strong>ipsum</strong></span></p>';
const output = html.htmlStringToComponents(input, { maxDepth: 2 });
expect(output).toMatchSnapshot();
});
it('calls onText callback', () => {
const input = '<p>lorem ipsum</p>';
const onText = vi.fn((text: string) => text);
html.htmlStringToComponents(input, { onText });
expect(onText).toHaveBeenCalledExactlyOnceWith('lorem ipsum');
});
// The parser hands `onElement` its live `children` array and only appends the
// converted child nodes after the callback has returned. That is presumably
// why the `arrayContaining` matcher below can see 'lorem ipsum': it is checked
// after parsing has finished.
it('calls onElement callback', () => {
const input = '<p>lorem ipsum</p>';
const onElement = vi.fn(
(element: HTMLElement, children: React.ReactNode[]) =>
// Note: spreading copies `children` while it is still empty. The returned
// element is not inspected by this test.
React.createElement(element.tagName.toLowerCase(), {}, ...children),
);
html.htmlStringToComponents(input, { onElement });
expect(onElement).toHaveBeenCalledExactlyOnceWith(
expect.objectContaining({ tagName: 'P' }),
expect.arrayContaining(['lorem ipsum']),
);
});
// Returning `undefined` from `onElement` must fall back to the built-in
// conversion, so the output matches the plain <p> snapshot.
it('uses default parsing if onElement returns undefined', () => {
const input = '<p>lorem ipsum</p>';
const onElement = vi.fn(() => undefined);
const output = html.htmlStringToComponents(input, { onElement });
expect(onElement).toHaveBeenCalledExactlyOnceWith(
expect.objectContaining({ tagName: 'P' }),
expect.arrayContaining(['lorem ipsum']),
);
expect(output).toMatchSnapshot();
});
// One call per attribute, each receiving the attribute name, its value and
// the lowercased tag name.
it('calls onAttribute callback', () => {
const input =
'<a href="https://example.com" target="_blank" rel="nofollow">link</a>';
const onAttribute = vi.fn(
(name: string, value: string) =>
[name, value] satisfies [string, string],
);
html.htmlStringToComponents(input, { onAttribute });
expect(onAttribute).toHaveBeenCalledTimes(3);
expect(onAttribute).toHaveBeenCalledWith(
'href',
'https://example.com',
'a',
);
expect(onAttribute).toHaveBeenCalledWith('target', '_blank', 'a');
expect(onAttribute).toHaveBeenCalledWith('rel', 'nofollow', 'a');
});
// <strong> is not in the allowed set, so the element and its text are
// omitted entirely; <p> and <em> are kept.
it('respects allowedTags option', () => {
const input = '<p>lorem <strong>ipsum</strong> <em>dolor</em></p>';
const output = html.htmlStringToComponents(input, {
allowedTags: new Set(['p', 'em']),
});
expect(output).toMatchSnapshot();
});
// Wall-clock guard against accidental quadratic behaviour on wide documents
// (one <p> with 1,000 <span> children).
it('ensure performance is acceptable with large input', () => {
const input = '<p>' + '<span>lorem</span>'.repeat(1_000) + '</p>';
const start = performance.now();
html.htmlStringToComponents(input);
const duration = performance.now() - start;
// Arbitrary threshold of 200ms for this test.
// Normally it's much less (<50ms), but the GH Action environment can be slow.
expect(duration).toBeLessThan(200);
});
});
});

View File

@ -1,3 +1,5 @@
import React from 'react';
// NB: This function can still return unsafe HTML
export const unescapeHTML = (html: string) => {
const wrapper = document.createElement('div');
@ -7,3 +9,177 @@ export const unescapeHTML = (html: string) => {
.replace(/<[^>]*>/g, '');
return wrapper.textContent;
};
/**
 * One pending unit of work in the traversal performed by
 * `htmlStringToComponents`.
 */
interface QueueItem {
/** DOM node that still has to be converted. */
node: Node;
/** Array that receives whatever `node` is converted into. */
parent: React.ReactNode[];
/** Nesting level of `node`; the parsed fragment itself is queued at 0. */
depth: number;
}
/** Optional settings accepted by `htmlStringToComponents`. */
interface Options {
/**
 * Nodes nested deeper than this are skipped. The parsed fragment is depth 0
 * and every level of nesting adds one. Defaults to 10.
 */
maxDepth?: number;
/**
 * Called with the content of every text node that is not empty or
 * whitespace-only. Its return value is emitted instead of the raw string.
 */
onText?: (text: string) => React.ReactNode;
/**
 * Called for every element whose tag is allowed. `children` is the array that
 * will hold the element's converted children: it is still empty when the
 * callback runs and is filled afterwards. Return `undefined` to fall back to
 * the default `React.createElement` conversion; any other value, including
 * `null`, is emitted as-is.
 */
onElement?: (
element: HTMLElement,
children: React.ReactNode[],
) => React.ReactNode;
/**
 * Called for every attribute of an element that goes through the default
 * conversion, with the attribute name, its value and the lowercased tag name.
 * Return `[propName, propValue]` to set a prop, or `null` to drop the
 * attribute. When omitted, attributes are copied to props unchanged.
 */
onAttribute?: (
name: string,
value: string,
tagName: string,
) => [string, unknown] | null;
/**
 * Lowercase tag names that may be converted. Elements with any other tag are
 * skipped together with their children. Defaults to `DEFAULT_ALLOWED_TAGS`.
 */
allowedTags?: Set<string>;
}
/**
 * Lowercase tag names that `htmlStringToComponents` converts when the caller
 * does not pass `allowedTags`. Elements with any other tag are skipped along
 * with everything inside them.
 *
 * The rows are kept in alphabetical order, which is also the set's insertion
 * order.
 *
 * NOTE(review): `dl` and `dt` are listed but `dd` is not — confirm that this
 * is intentional.
 */
const DEFAULT_ALLOWED_TAGS: ReadonlySet<string> = new Set(
  [
    'a abbr b blockquote br cite code del dfn dl dt em',
    'h1 h2 h3 h4 h5 h6 hr',
    'i li ol p pre small span strong sub sup time u ul',
  ].flatMap((row) => row.split(' ')),
);
/**
 * Parses an HTML string and converts the resulting DOM into React nodes.
 *
 * The string is parsed through a `<template>` element and walked breadth-first.
 * Text nodes become strings (or whatever `onText` returns), allowed elements
 * become React elements (or whatever `onElement` returns), and everything else
 * is dropped:
 *
 * - text nodes that are empty or whitespace-only,
 * - elements whose lowercased tag is not in `allowedTags`, including all of
 *   their descendants,
 * - nodes nested deeper than `maxDepth` (the parsed fragment is depth 0).
 *
 * NOTE(review): without `onAttribute`, attributes are copied to props under
 * their HTML names and as plain strings. Names that React spells differently
 * (`class`, `for`) or expects in another shape (`style`) are not translated
 * here; pass `onAttribute` if the input may contain them.
 *
 * @param htmlString - Markup to convert.
 * @param options - Optional depth limit, tag allow-list and conversion hooks.
 * @returns The converted top-level nodes, in document order.
 */
export function htmlStringToComponents(
  htmlString: string,
  options: Options = {},
): React.ReactNode[] {
  const {
    maxDepth = 10,
    allowedTags = DEFAULT_ALLOWED_TAGS,
    onAttribute,
    onElement,
    onText,
  } = options;

  const wrapper = document.createElement('template');
  wrapper.innerHTML = htmlString;

  const rootChildren: React.ReactNode[] = [];
  const queue: QueueItem[] = [
    { node: wrapper.content, parent: rootChildren, depth: 0 },
  ];

  // Breadth-first walk. Array iteration also visits items appended while the
  // loop is running, so a single forward pass drains the queue without the
  // per-item re-indexing cost of `Array.prototype.shift()`.
  for (const { node, parent, depth } of queue) {
    // If maxDepth is exceeded, skip processing this node.
    if (depth > maxDepth) {
      continue;
    }

    switch (node.nodeType) {
      // Just process children for fragments.
      case Node.DOCUMENT_FRAGMENT_NODE: {
        for (const child of node.childNodes) {
          queue.push({ node: child, parent, depth: depth + 1 });
        }
        break;
      }

      // Text can be added directly if it has any non-whitespace content.
      case Node.TEXT_NODE: {
        const text = node.textContent;
        if (text && text.trim() !== '') {
          parent.push(onText ? onText(text) : text);
        }
        break;
      }

      // Process elements with attributes and then their children.
      case Node.ELEMENT_NODE: {
        if (!(node instanceof HTMLElement)) {
          console.warn('Expected HTMLElement, got', node);
          continue;
        }

        // If the tag is not allowed, skip it and its children.
        const tagName = node.tagName.toLowerCase();
        if (!allowedTags.has(tagName)) {
          continue;
        }

        // This array is handed out by reference and only filled once the
        // child nodes are dequeued further down the walk.
        const children: React.ReactNode[] = [];

        // Give the caller the first chance to build the element. Only
        // `undefined` falls through to the default conversion, so returning
        // `null` is a valid way to render nothing.
        let element: React.ReactNode = onElement?.(node, children);

        if (element === undefined) {
          const props: Record<string, unknown> = {};
          for (const attr of node.attributes) {
            if (!onAttribute) {
              props[attr.name] = attr.value;
              continue;
            }
            const result = onAttribute(attr.name, attr.value, tagName);
            if (result) {
              const [name, value] = result;
              props[name] = value;
            }
          }

          // Only pass `children` when the DOM node can actually have some.
          // React DOM refuses to render void elements such as <br> or <hr>
          // if they carry a non-null `children` prop, even an empty array.
          element =
            node.childNodes.length > 0
              ? React.createElement(tagName, props, children)
              : React.createElement(tagName, props);
        }

        // Push the element to the parent.
        parent.push(element);

        // Queue the child nodes so they are converted into `children`.
        for (const child of node.childNodes) {
          queue.push({ node: child, parent: children, depth: depth + 1 });
        }
        break;
      }
    }
  }

  return rootChildren;
}