@@ -53,6 +53,12 @@ public final class SubtitleDeduplicator {
5353
5454 private static final float BACKOFF_FACTOR = 1.0f ;
5555
56+ // Once NewPipe/ExoPlayer supports styled subtitle rendering
57+ // (e.g., colors, bold, background), set this to 'true'
58+ // to preserve different styles for the same subtitle text
59+ // in consecutive subtitle entries.
60+ private static final boolean SUPPORT_STYLED_SUBTITLE_RENDERING = false ;
61+
5662 private static String subCacheDir = "subtitle_cache" ;
5763
5864 private static File cacheDir = null ;
@@ -250,6 +256,7 @@ public static boolean containsDuplicateTtmlEntries(final File subtitleFile) {
250256 // Detects whether the subtitle contains duplicated <p> entries
251257 // using the same normalized (whitespace-trimmed) comparison rules
252258 // as deduplicateContent().
259+ // Note: an "entry" here means a single TTML <p> (paragraph) element.
253260 public static boolean containsDuplicatedEntries (final String subtitleContent ) {
254261 if (stringIsNullOrEmpty (subtitleContent )) {
255262 return false ;
@@ -296,7 +303,7 @@ public static String deduplicateTtmlFile(final File subtitleFile) {
296303 }
297304
298305 public static String deduplicateContent (final String subtitleContent ) {
299- // Subtitle entries are considered duplicated only if:
306+ // Subtitle entries/paragraphs are considered duplicated only if:
300307 // 1) begin timestamp is exactly the same,
301308 // 2) end timestamp is exactly the same,
302309 // 3) subtitle text content is the same
@@ -360,25 +367,89 @@ private static String getSubtitleKeyOfTtml(final Matcher matcher) {
360367 final String begin = matcher .group (1 ).trim ();
361368 final String end = matcher .group (2 ).trim ();
362369
370+ // Textual content units inside the <p> element.
371+ // It may contain <span style="..."> tags/attributes.
372+ final String rawTextualContent = matcher .group (3 );
373+
374+ String textContent = null ;
363375 // Normalize subtitle text before comparison:
364- // - Leading and trailing whitespace is ignored
365- // - Runs of whitespace are collapsed into a single space (' ')
366376 //
367377 // Note:
368378 // This operates on raw TTML text as received (before XML entity decoding).
369379 // XML-encoded whitespace (e.g. &#x9;) is not decoded at this stage.
370380 //
371- // This is intentional: visually identical subtitles may differ only
372- // in whitespace due to formatting or extraction differences, and
373- // should be considered duplicates in such cases.
374- final String content = matcher .group (3 )
375- .trim ()
376- .replaceAll ("\\ s+" , " " );
377-
378- final String key = begin + "|" + end + "|" + content ;
381+ if (!SUPPORT_STYLED_SUBTITLE_RENDERING ) {
382+ // Purpose:
383+ // Some subtitles have the same text but different style
384+ // attributes (e.g., colors, bold).
385+ // If NewPipe does not support styled subtitle rendering,
386+ // style attributes are meaningless, so they are ignored
387+ // during deduplication.
388+ //
389+ // Example:
390+ // <p begin="00:00:11.452" end="00:00:14.388" style="s2">
391+ // <span style="s3">Magic</span>
392+ // </p>
393+ // <p begin="00:00:11.452" end="00:00:14.388" style="s2">
394+ // <span style="s11">Magic</span>
395+ // </p>
396+ // These two subtitles have the same visible text but
397+ // different style attributes. They will be considered
398+ // duplicates after stripping style tags.
399+ //
400+ // Note:
401+ // It may still contain <br> tags, which we intentionally
402+ // keep for semantic meaning.
403+ final String textWithoutStyles = stripStyleTags (rawTextualContent );
404+ textContent = normalizeParagraphText (textWithoutStyles );
405+ } else {
406+ textContent = normalizeParagraphText (rawTextualContent );
407+ }
408+
409+ final String key = begin + "|" + end + "|" + textContent ;
379410 return key ;
380411 }
381412
413+ private static String stripStyleTags (final String textualContent ) {
414+ return textualContent
415+ .replaceAll ("<span[^>]*>" , "" )
416+ .replaceAll ("</span>" , "" );
417+ }
418+
419+ private static String normalizeParagraphText (final String textContent ) {
420+ if (textContent == null ) {
421+ return "" ;
422+ }
423+
424+ final String normalized = textContent
425+ // Remove invisible Unicode characters
426+ // Reason:
427+ // Two subtitle entries may look the same visually, but
428+ // they may differ in code due to invisible characters.
429+ // Removing them ensures proper detection of duplicated
430+ // subtitles.
431+ // Covered characters:
432+ // - Zero-width spaces and related characters (U+200B to U+200D)
433+ // - Directionality control characters (U+200E, U+200F)
434+ // - Directionality formatting characters (U+202A to U+202E)
435+ // - Byte Order Mark (BOM, U+FEFF)
436+ .replaceAll ("[\\ u200B-\\ u200F\\ u202A-\\ u202E\\ uFEFF]" , "" )
437+
438+ // normalize non-breaking space to normal space
439+ .replace ('\u00A0' , ' ' )
440+
441+ // Runs of whitespace are collapsed into a single space (' ')
442+ // This is intentional: visually identical subtitles
443+ // may differ only in whitespace due to formatting or
444+ // extraction, and should still be considered duplicates.
445+ .replaceAll ("\\ s+" , " " )
446+
447+ // Leading and trailing whitespace is ignored
448+ .trim ();
449+
450+ return normalized ;
451+ }
452+
382453 private static String buildLocalFileUri (final File subtitleCacheFile ) {
383454 final String path = LOCAL_SUBTITLE_URL_PREFIX + subtitleCacheFile .getAbsolutePath ();
384455
0 commit comments