@@ -54,6 +54,157 @@ private void writeString(final String text) throws IOException {
5454 out .write (text .getBytes (charset ));
5555 }
5656
57+ /**
58+ * Decode XML or HTML entities into their actual (literal) characters.
59+ *
60+ * TTML is XML-based, so text nodes may contain escaped entities
61+ * instead of direct characters. For example:
62+ *
63+ * "&" → "&"
64+ * "<" → "<"
65+ * ">" → ">"
66+ * "	" → "\t" (TAB)
67+ * "
" ( ) → "\n" (LINE FEED)
68+ *
69+ * XML files cannot contain characters like "<", ">", "&" directly,
70+ * so they must be represented using their entity-encoded forms.
71+ *
72+ * Jsoup sometimes leaves nested or encoded entities unresolved
73+ * (e.g. inside <p> text nodes in TTML files), so this function
74+ * acts as a final “safety net” to ensure all entities are decoded
75+ * before further normalization.
76+ *
77+ * Character representation layers for reference:
78+ * - Literal characters: <, >, &
79+ * → appear in runtime/output text (e.g. final SRT output)
80+ * - Escaped entities: <, >, &
81+ * → appear in XML/HTML/TTML source files
82+ * - Numeric entities:  , 	, 
83+ * → appear mainly in XML/TTML files (also valid in HTML)
84+ * for non-printable or special characters
85+ * - Unicode escapes: \u00A0 (Java/Unicode internal form)
86+ * → appear only in Java source code (NOT valid in XML)
87+ *
88+ * XML entities include both named (&, <) and numeric
89+ * ( ,  ) forms.
90+ *
91+ * @param encodedEntities The raw text fragment possibly containing
92+ * encoded XML entities.
93+ * @return A decoded string where all entities are replaced by their
94+ * actual (literal) characters.
95+ */
96+ private String decodeXmlEntities (final String encodedEntities ) {
97+ final String decoded = Parser .unescapeEntities (encodedEntities , true );
98+ return decoded ;
99+ }
100+
101+ /**
102+ * Handle rare XML entity characters like LF: 
(`\n`)
103+ * , CR: 
(`\r`) and CRLF: (`\r\n`).
104+ *
105+ * These are technically valid in TTML (XML allows them)
106+ * but unusual in practice, since most TTML line breaks
107+ * are represented as <br/> tags instead.
108+ * As a defensive approach, we normalize them:
109+ *
110+ * - Windows (\r\n), macOS (\r), and Unix (\n) → unified SRT NEW_LINE (\r\n)
111+ *
112+ * Although well-formed TTML normally encodes line breaks
113+ * as <br/> tags, some auto-generated or malformed TTML files
114+ * may embed literal newline entities (
, 
). This
115+ * normalization ensures these cases render properly in SRT
116+ * players instead of breaking the subtitle structure.
117+ *
118+ * @param text To be normalized text with actual characters.
119+ * @return Unified SRT NEW_LINE converted from all kinds of line breaks.
120+ */
121+ private String normalizeLineBreakForSrt (final String text ) {
122+ String cleaned = text ;
123+
124+ // NOTE:
125+ // The order of newline replacements must NOT change,
126+ // or duplicated line breaks (e.g. \r\n → \n\n) will occur.
127+ cleaned = cleaned .replace ("\r \n " , "\n " )
128+ .replace ("\r " , "\n " );
129+
130+ cleaned = cleaned .replace ("\n " , NEW_LINE );
131+
132+ return cleaned ;
133+ }
134+
135+ private String normalizeForSrt (final String actualText ) {
136+ String cleaned = actualText ;
137+
138+ // Replace non-breaking space (\u00A0) with regular space ' '(\u0020).
139+ // - YouTube TTML subtitles use both regular spaces (\u0020)
140+ // and non-breaking spaces (\u00A0).
141+ // - SRT subtitles only support regular spaces (\u0020),
142+ // so \u00A0 may cause display issues.
143+ // - \u00A0 and \u0020 are visually identical (i.e., they both
144+ // appear as spaces ' '), but they differ in Unicode encoding,
145+ // leading to test failures (e.g., ComparisonFailure).
146+ // - Convert \u00A0 to \u0020 to ensure consistency in subtitle
147+ // formatting.
148+ // - References:
149+ // - Unicode General Punctuation: https://unicode.org/charts/PDF/U2000.pdf
150+ // - TTML Spec: https://www.w3.org/TR/ttml2/
151+ // - SRT Format: https://en.wikipedia.org/wiki/SubRip
152+ cleaned = cleaned .replace ('\u00A0' , ' ' ) // Non-breaking space
153+ .replace ('\u202F' , ' ' ) // Narrow no-break space
154+ .replace ('\u205F' , ' ' ) // Medium mathematical space
155+ .replace ('\u3000' , ' ' ) // Ideographic space
156+ // \u2000 ~ \u200A are whitespace characters (e.g.,
157+ // en space, em space), replaced with regular space (\u0020).
158+ .replaceAll ("[\\ u2000-\\ u200A]" , " " ); // Whitespace characters
159+
160+ // \u200B ~ \u200F are a range of non-spacing characters
161+ // (e.g., zero-width space, zero-width non-joiner, etc.),
162+ // which have no effect in *.SRT files and may cause
163+ // display issues.
164+ // These characters are invisible to the human eye, and
165+ // they still exist in the encoding, so they need to be
166+ // removed.
167+ // After removal, the actual content becomes completely
168+ // empty "", meaning there are no characters left, just
169+ // an empty space, which helps avoid formatting issues
170+ // in subtitles.
171+ cleaned = cleaned .replaceAll ("[\\ u200B-\\ u200F]" , "" ); // Non-spacing characters
172+
173+ // Remove control characters (\u0000 ~ \u001F, except
174+ // \n, \r, \t).
175+ // - These are ASCII C0 control codes (e.g. \u0001 SOH,
176+ // \u0008 BS, \u001F US), invisible and irrelevant in
177+ // subtitles, may cause square boxes (?) in players.
178+ // - Reference:
179+ // Unicode Basic Latin (https://unicode.org/charts/PDF/U0000.pdf)
180+ // ASCII Control (https://en.wikipedia.org/wiki/ASCII#Control_characters)
181+ cleaned = cleaned .replaceAll ("[\\ u0000-\\ u0008\\ u000B\\ u000C\\ u000E-\\ u001F]" , "" );
182+
183+ // Reasoning:
184+ // - subtitle files generally don't require tabs for alignment.
185+ // - Tabs can be displayed with varying widths across different
186+ // editors or platforms, which may cause display issues.
187+ // - Replace it with a single space for consistent display
188+ // across different editors or platforms.
189+ cleaned = cleaned .replace ('\t' , ' ' );
190+
191+ cleaned = normalizeLineBreakForSrt (cleaned );
192+
193+ return cleaned ;
194+ }
195+
196+ private String sanitizeFragment (final String raw ) {
197+ if (null == raw ) {
198+ return "" ;
199+ }
200+
201+ final String actualCharacters = decodeXmlEntities (raw );
202+
203+ final String srtSafeText = normalizeForSrt (actualCharacters );
204+
205+ return srtSafeText ;
206+ }
207+
57208 // CHECKSTYLE:OFF checkstyle:JavadocStyle
58209 // checkstyle does not understand that span tags are inside a code block
59210 /**
@@ -67,9 +218,25 @@ private void writeString(final String text) throws IOException {
67218 * @param node the current node to process
68219 * @param text the {@link StringBuilder} to append the extracted text to
69220 */
221+ // --------------------------------------------------------------------
222+ // [INTERNAL NOTE] TTML text layer explanation
223+ //
224+ // TTML parsing involves multiple text "layers":
225+ // 1. Raw XML entities (e.g., &lt;, &#160;) are decoded by Jsoup.
226+ // 2. extractText() works on DOM TextNodes (already parsed strings).
227+ // 3. sanitizeFragment() decodes remaining entities and fixes
228+ // Unicode quirks.
229+ // 4. normalizeForSrt() ensures literal text is safe for SRT output.
230+ //
231+ // In short:
232+ // Jsoup handles XML-level syntax,
233+ // our code handles text-level normalization for subtitles.
234+ // --------------------------------------------------------------------
70235 private void extractText (final Node node , final StringBuilder text ) {
71236 if (node instanceof TextNode textNode ) {
72- text .append ((textNode ).text ());
237+ String rawTtmlFragment = textNode .getWholeText ();
238+ String srtContent = sanitizeFragment (rawTtmlFragment );
239+ text .append (srtContent );
73240 } else if (node instanceof Element element ) {
74241 // <br> is a self-closing HTML tag used to insert a line break.
75242 if (element .tagName ().equalsIgnoreCase ("br" )) {
0 commit comments