Skip to content

Commit 8d119f5

Browse files
committed
[duplicated subtitle] Add methods in SubtitleDeduplicator to ignore style tags and normalize subtitle text content
- Helps handle YouTube subtitles that have the same text and timestamps but different style attributes - Add a SUPPORT_STYLED_SUBTITLE_RENDERING flag for future styled subtitle support (styled rendering is currently not supported in NewPipe) - Remove invisible Unicode characters (zero-width characters, directionality controls, and the BOM, U+FEFF) - Normalize non-breaking spaces, collapse runs of whitespace, and trim leading/trailing whitespace - Tested with: https://www.youtube.com/watch?v=7w3jBGX7UcY
1 parent 6818991 commit 8d119f5

1 file changed

Lines changed: 82 additions & 11 deletions

File tree

app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java

Lines changed: 82 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,12 @@ public final class SubtitleDeduplicator {
5353

5454
private static final float BACKOFF_FACTOR = 1.0f;
5555

56+
// Once NewPipe/ExoPlayer supports styled subtitle rendering
57+
// (e.g., colors, bold, background), set this to 'true'
58+
// to preserve different styles for the same subtitle text
59+
// in consecutive subtitle entries.
60+
private static final boolean SUPPORT_STYLED_SUBTITLE_RENDERING = false;
61+
5662
private static String subCacheDir = "subtitle_cache";
5763

5864
private static File cacheDir = null;
@@ -250,6 +256,7 @@ public static boolean containsDuplicateTtmlEntries(final File subtitleFile) {
250256
// Detects whether the subtitle contains duplicated <p> entries
251257
// using the same normalized (whitespace-trimmed) comparison rules
252258
// as deduplicateContent().
259+
// Note: entry == paragraph
253260
public static boolean containsDuplicatedEntries(final String subtitleContent) {
254261
if (stringIsNullOrEmpty(subtitleContent)) {
255262
return false;
@@ -296,7 +303,7 @@ public static String deduplicateTtmlFile(final File subtitleFile) {
296303
}
297304

298305
public static String deduplicateContent(final String subtitleContent) {
299-
// Subtitle entries are considered duplicated only if:
306+
// Subtitle entries/paragraphs are considered duplicated only if:
300307
// 1) begin timestamp is exactly the same,
301308
// 2) end timestamp is exactly the same,
302309
// 3) subtitle text content is the same
@@ -360,25 +367,89 @@ private static String getSubtitleKeyOfTtml(final Matcher matcher) {
360367
final String begin = matcher.group(1).trim();
361368
final String end = matcher.group(2).trim();
362369

370+
// Textual content units inside the <p> element.
371+
// It may contain <span style="..."> tags/attributes.
372+
final String rawTextualContent = matcher.group(3);
373+
374+
String textContent = null;
363375
// Normalize subtitle text before comparison:
364-
// - Leading and trailing whitespace is ignored
365-
// - Runs of whitespace are collapsed into a single space (' ')
366376
//
367377
// Note:
368378
// This operates on raw TTML text as received (before XML entity decoding).
369379
// XML-encoded whitespace (e.g. &#x9;) is not decoded at this stage.
370380
//
371-
// This is intentional: visually identical subtitles may differ only
372-
// in whitespace due to formatting or extraction differences, and
373-
// should be considered duplicates in such cases.
374-
final String content = matcher.group(3)
375-
.trim()
376-
.replaceAll("\\s+", " ");
377-
378-
final String key = begin + "|" + end + "|" + content;
381+
if (!SUPPORT_STYLED_SUBTITLE_RENDERING) {
382+
// Purpose:
383+
// Some subtitles have the same text but different style
384+
// attributes (e.g., colors, bold).
385+
// If NewPipe does not support styled subtitle rendering,
386+
// style attributes are meaningless, so they are ignored
387+
// during deduplication.
388+
//
389+
// Example:
390+
// <p begin="00:00:11.452" end="00:00:14.388" style="s2">
391+
// <span style="s3">Magic</span>
392+
// </p>
393+
// <p begin="00:00:11.452" end="00:00:14.388" style="s2">
394+
// <span style="s11">Magic</span>
395+
// </p>
396+
// These two subtitles have the same visible text but
397+
// different style attributes. They will be considered
398+
// duplicates after stripping style tags.
399+
//
400+
// Note:
401+
// It may still contain <br> tags, which we intentionally
402+
// keep for semantic meaning.
403+
final String textWithoutStyles = stripStyleTags(rawTextualContent);
404+
textContent = normalizeParagraphText(textWithoutStyles);
405+
} else {
406+
textContent = normalizeParagraphText(rawTextualContent);
407+
}
408+
409+
final String key = begin + "|" + end + "|" + textContent;
379410
return key;
380411
}
381412

413+
private static String stripStyleTags(final String textualContent) {
414+
return textualContent
415+
.replaceAll("<span[^>]*>", "")
416+
.replaceAll("</span>", "");
417+
}
418+
419+
private static String normalizeParagraphText(final String textContent) {
420+
if (textContent == null) {
421+
return "";
422+
}
423+
424+
final String normalized = textContent
425+
// Remove invisible Unicode characters
426+
// Reason:
427+
// Two subtitle entries may look the same visually, but
428+
// they may differ in code due to invisible characters.
429+
// Removing them ensures proper detection of duplicated
430+
// subtitles.
431+
// Covered characters:
432+
// - Zero-width spaces and related characters (U+200B to U+200D)
433+
// - Directionality control characters (U+200E, U+200F)
434+
// - Directionality formatting characters (U+202A to U+202E)
435+
// - Byte Order Mark (BOM, U+FEFF)
436+
.replaceAll("[\\u200B-\\u200F\\u202A-\\u202E\\uFEFF]", "")
437+
438+
// normalize non-breaking space to normal space
439+
.replace('\u00A0', ' ')
440+
441+
// Runs of whitespace are collapsed into a single space (' ')
442+
// This is intentional: visually identical subtitles
443+
// may differ only in whitespace due to formatting or
444+
// extraction, and should still be considered duplicates.
445+
.replaceAll("\\s+", " ")
446+
447+
// Leading and trailing whitespace is ignored
448+
.trim();
449+
450+
return normalized;
451+
}
452+
382453
private static String buildLocalFileUri(final File subtitleCacheFile) {
383454
final String path = LOCAL_SUBTITLE_URL_PREFIX + subtitleCacheFile.getAbsolutePath();
384455

0 commit comments

Comments
 (0)