Skip to content

Commit 22ee01b

Browse files
committed
refactor(ttml): improve extractText() to preserve spaces and special characters
- Replaced `text()` with `getWholeText()`, which: - avoids losing whitespace at the beginning, at the end, or within the text; - avoids merging two or more consecutive spaces into a single space ' '; - avoids converting '\r', '\n', and '\r\n' within the text into a single space ' '. For subtitle conversion, the goal is to preserve every character exactly as intended by the subtitle author. - Normalized tabs, line breaks, and other special characters for SRT-safe output. - Added comprehensive unit tests in `SrtFromTtmlWriterTest.java`, including cases for simple and nested tags.
1 parent e1888ed commit 22ee01b

2 files changed

Lines changed: 488 additions & 1 deletion

File tree

app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java

Lines changed: 168 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,157 @@ private void writeString(final String text) throws IOException {
5454
out.write(text.getBytes(charset));
5555
}
5656

57+
/**
58+
* Decode XML or HTML entities into their actual (literal) characters.
59+
*
60+
* TTML is XML-based, so text nodes may contain escaped entities
61+
* instead of direct characters. For example:
62+
*
63+
* "&" → "&"
64+
* "&lt;" → "<"
65+
* "&gt;" → ">"
66+
* "&#x9;" → "\t" (TAB)
67+
* "&#xA;" (&#10;) → "\n" (LINE FEED)
68+
*
69+
* XML files cannot contain characters like "<", ">", "&" directly,
70+
* so they must be represented using their entity-encoded forms.
71+
*
72+
* Jsoup sometimes leaves nested or encoded entities unresolved
73+
* (e.g. inside <p> text nodes in TTML files), so this function
74+
* acts as a final “safety net” to ensure all entities are decoded
75+
* before further normalization.
76+
*
77+
* Character representation layers for reference:
78+
* - Literal characters: <, >, &
79+
* → appear in runtime/output text (e.g. final SRT output)
80+
* - Escaped entities: &lt;, &gt;, &amp;
81+
* → appear in XML/HTML/TTML source files
82+
* - Numeric entities: &#xA0;, &#x9;, &#xD;
83+
* → appear mainly in XML/TTML files (also valid in HTML)
84+
* for non-printable or special characters
85+
* - Unicode escapes: \u00A0 (Java/Unicode internal form)
86+
* → appear only in Java source code (NOT valid in XML)
87+
*
88+
* XML entities include both named (&amp;, &lt;) and numeric
89+
* (&#xA0;, &#160;) forms.
90+
*
91+
* @param encodedEntities The raw text fragment possibly containing
92+
* encoded XML entities.
93+
* @return A decoded string where all entities are replaced by their
94+
* actual (literal) characters.
95+
*/
96+
private String decodeXmlEntities(final String encodedEntities) {
97+
final String decoded = Parser.unescapeEntities(encodedEntities, true);
98+
return decoded;
99+
}
100+
101+
/**
102+
* Handle rare XML entity characters like LF: &#xA;(`\n`)
103+
* , CR: &#xD;(`\r`) and CRLF: (`\r\n`).
104+
*
105+
* These are technically valid in TTML (XML allows them)
106+
* but unusual in practice, since most TTML line breaks
107+
* are represented as <br/> tags instead.
108+
* As a defensive approach, we normalize them:
109+
*
110+
* - Windows (\r\n), macOS (\r), and Unix (\n) → unified SRT NEW_LINE (\r\n)
111+
*
112+
* Although well-formed TTML normally encodes line breaks
113+
* as <br/> tags, some auto-generated or malformed TTML files
114+
* may embed literal newline entities (&#xA;, &#xD;). This
115+
* normalization ensures these cases render properly in SRT
116+
* players instead of breaking the subtitle structure.
117+
*
118+
* @param text To be normalized text with actual characters.
119+
* @return Unified SRT NEW_LINE converted from all kinds of line breaks.
120+
*/
121+
private String normalizeLineBreakForSrt(final String text) {
122+
String cleaned = text;
123+
124+
// NOTE:
125+
// The order of newline replacements must NOT change,
126+
// or duplicated line breaks (e.g. \r\n → \n\n) will occur.
127+
cleaned = cleaned.replace("\r\n", "\n")
128+
.replace("\r", "\n");
129+
130+
cleaned = cleaned.replace("\n", NEW_LINE);
131+
132+
return cleaned;
133+
}
134+
135+
private String normalizeForSrt(final String actualText) {
136+
String cleaned = actualText;
137+
138+
// Replace non-breaking space (\u00A0) with regular space ' '(\u0020).
139+
// - YouTube TTML subtitles use both regular spaces (\u0020)
140+
// and non-breaking spaces (\u00A0).
141+
// - SRT subtitles only support regular spaces (\u0020),
142+
// so \u00A0 may cause display issues.
143+
// - \u00A0 and \u0020 are visually identical (i.e., they both
144+
// appear as spaces ' '), but they differ in Unicode encoding,
145+
// leading to test failures (e.g., ComparisonFailure).
146+
// - Convert \u00A0 to \u0020 to ensure consistency in subtitle
147+
// formatting.
148+
// - References:
149+
// - Unicode General Punctuation: https://unicode.org/charts/PDF/U2000.pdf
150+
// - TTML Spec: https://www.w3.org/TR/ttml2/
151+
// - SRT Format: https://en.wikipedia.org/wiki/SubRip
152+
cleaned = cleaned.replace('\u00A0', ' ') // Non-breaking space
153+
.replace('\u202F', ' ') // Narrow no-break space
154+
.replace('\u205F', ' ') // Medium mathematical space
155+
.replace('\u3000', ' ') // Ideographic space
156+
// \u2000 ~ \u200A are whitespace characters (e.g.,
157+
// en space, em space), replaced with regular space (\u0020).
158+
.replaceAll("[\\u2000-\\u200A]", " "); // Whitespace characters
159+
160+
// \u200B ~ \u200F are a range of non-spacing characters
161+
// (e.g., zero-width space, zero-width non-joiner, etc.),
162+
// which have no effect in *.SRT files and may cause
163+
// display issues.
164+
// These characters are invisible to the human eye, and
165+
// they still exist in the encoding, so they need to be
166+
// removed.
167+
// After removal, the actual content becomes completely
168+
// empty "", meaning there are no characters left, just
169+
// an empty space, which helps avoid formatting issues
170+
// in subtitles.
171+
cleaned = cleaned.replaceAll("[\\u200B-\\u200F]", ""); // Non-spacing characters
172+
173+
// Remove control characters (\u0000 ~ \u001F, except
174+
// \n, \r, \t).
175+
// - These are ASCII C0 control codes (e.g. \u0001 SOH,
176+
// \u0008 BS, \u001F US), invisible and irrelevant in
177+
// subtitles, may cause square boxes (?) in players.
178+
// - Reference:
179+
// Unicode Basic Latin (https://unicode.org/charts/PDF/U0000.pdf)
180+
// ASCII Control (https://en.wikipedia.org/wiki/ASCII#Control_characters)
181+
cleaned = cleaned.replaceAll("[\\u0000-\\u0008\\u000B\\u000C\\u000E-\\u001F]", "");
182+
183+
// Reasoning:
184+
// - subtitle files generally don't require tabs for alignment.
185+
// - Tabs can be displayed with varying widths across different
186+
// editors or platforms, which may cause display issues.
187+
// - Replace it with a single space for consistent display
188+
// across different editors or platforms.
189+
cleaned = cleaned.replace('\t', ' ');
190+
191+
cleaned = normalizeLineBreakForSrt(cleaned);
192+
193+
return cleaned;
194+
}
195+
196+
private String sanitizeFragment(final String raw) {
197+
if (null == raw) {
198+
return "";
199+
}
200+
201+
final String actualCharacters = decodeXmlEntities(raw);
202+
203+
final String srtSafeText = normalizeForSrt(actualCharacters);
204+
205+
return srtSafeText;
206+
}
207+
57208
// CHECKSTYLE:OFF checkstyle:JavadocStyle
58209
// checkstyle does not understand that span tags are inside a code block
59210
/**
@@ -67,9 +218,25 @@ private void writeString(final String text) throws IOException {
67218
* @param node the current node to process
68219
* @param text the {@link StringBuilder} to append the extracted text to
69220
*/
221+
// --------------------------------------------------------------------
222+
// [INTERNAL NOTE] TTML text layer explanation
223+
//
224+
// TTML parsing involves multiple text "layers":
225+
// 1. Raw XML entities (e.g., &lt;, &#xA0;) are decoded by Jsoup.
226+
// 2. extractText() works on DOM TextNodes (already parsed strings).
227+
// 3. sanitizeFragment() decodes remaining entities and fixes
228+
// Unicode quirks.
229+
// 4. normalizeForSrt() ensures literal text is safe for SRT output.
230+
//
231+
// In short:
232+
// Jsoup handles XML-level syntax,
233+
// our code handles text-level normalization for subtitles.
234+
// --------------------------------------------------------------------
70235
private void extractText(final Node node, final StringBuilder text) {
71236
if (node instanceof TextNode textNode) {
72-
text.append((textNode).text());
237+
String rawTtmlFragment = textNode.getWholeText();
238+
String srtContent = sanitizeFragment(rawTtmlFragment);
239+
text.append(srtContent);
73240
} else if (node instanceof Element element) {
74241
// <br> is a self-closing HTML tag used to insert a line break.
75242
if (element.tagName().equalsIgnoreCase("br")) {

0 commit comments

Comments
 (0)