|
1 | | -import { load } from 'cheerio' |
2 | 1 | import { decode } from 'html-entities' |
3 | 2 |
|
| 3 | +// Strip all HTML tags, leaving only text content. |
| 4 | +// Handles nested tags like `<p>text with <code>code</code></p>`. |
| 5 | +const TAG_RE = /<[^>]+>/g |
| 6 | + |
4 | 7 | // Given a piece of HTML return it without HTML. E.g. |
5 | 8 | // `<p>Foo &amp; bar</p>` becomes `Foo & bar` |
6 | 9 | // and `A <a href="">link</a> and <code>code</code>` becomes `A link and code`. |
7 | | -// Take advantage of the subtle fact that a lot of the times, the html value |
8 | | -// we get here is a single line that starts with `<p>` and ends with `</p>` |
9 | | -// and contains no longer HTML tags. |
| 10 | +// |
| 11 | +// This operates on trusted rendered HTML from our own render pipeline, |
| 12 | +// not user-supplied input. The output is used for plain-text display only |
| 13 | +// (mini-TOC items, search descriptions, etc.). |
10 | 14 | export function fastTextOnly(html: string): string { |
11 | 15 | if (!html) return '' |
| 16 | + // Fast path: simple `<p>text</p>` with no inner tags |
12 | 17 | if (html.startsWith('<p>') && html.endsWith('</p>')) { |
13 | 18 | const middle = html.slice(3, -4) |
14 | 19 | if (!middle.includes('<')) return decode(middle.trim()) |
15 | 20 | } |
16 | | - const $ = load(html, { xmlMode: true }) |
17 | | - return $.root().text().trim() |
| 21 | + // Strip all tags and decode entities. |
| 22 | + return decode(html.replace(TAG_RE, '').trim()) // lgtm[js/incomplete-multi-character-sanitization] |
18 | 23 | } |
0 commit comments