@@ -59,21 +59,22 @@ public final class SubtitleDeduplicator {
5959 // in consecutive subtitle entries.
6060 private static final boolean SUPPORT_STYLED_SUBTITLE_RENDERING = false ;
6161
62- private static String subCacheDir = "subtitle_cache" ;
62+ private static final String SUBTITLE_DEDUP_CACHE_DIR = "subtitle_cache" ;
6363
6464 private static File cacheDir = null ;
6565
// Private constructor: this class is not meant to be instantiated.
private SubtitleDeduplicator() {
}
6969
70- // cacheDir is /storage/emulated/0/Android/data/<package_name>/cache/{subCacheDir}
70+ // cacheDir is /storage/emulated/0/Android/data/<package_name>/
71+ // cache/{SUBTITLE_DEDUP_CACHE_DIR}
7172 public static void setCacheDirPath (final String path ) {
7273 if (stringIsNullOrEmpty (path )) {
7374 return ;
7475 }
7576
76- cacheDir = new File (path , subCacheDir );
77+ cacheDir = new File (path , SUBTITLE_DEDUP_CACHE_DIR );
7778
7879 createDirIfNotExist (cacheDir );
7980 }
@@ -136,7 +137,7 @@ public static String checkAndDeduplicate(final String remoteSubtitleUrl,
136137 currentSubtitleOrigin ,
137138 currentSubtitleState );
138139
139- final String localSubtitleUri = storeItToCacheDir (finalContent ,
140+ final String localSubtitleUri = writeContentToCacheFile (finalContent ,
140141 format ,
141142 currentSubtitleOrigin ,
142143 currentCacheFile );
@@ -257,21 +258,21 @@ public static boolean containsDuplicateTtmlEntries(final File subtitleFile) {
257258 // using the same normalized (whitespace-trimmed) comparison rules
258259 // as deduplicateContent().
259260 // Note: entry == paragraph
260- public static boolean containsDuplicatedEntries (final String subtitleContent ) {
261- if (stringIsNullOrEmpty (subtitleContent )) {
261+ public static boolean containsDuplicatedEntries (final String ttmlFileContent ) {
262+ if (stringIsNullOrEmpty (ttmlFileContent )) {
262263 return false ;
263264 }
264265
265- final Matcher matcher = getTtmlMatcher (subtitleContent );
266+ final Matcher matcher = getTtmlMatcher (ttmlFileContent );
266267
267- final Set <String > seen = new HashSet <>();
268+ final Set <String > processedKeys = new HashSet <>();
268269 while (matcher .find ()) {
269- final String key = getSubtitleKeyOfTtml (matcher );
270+ final String currentParagraphKey = buildDeduplicationKey (matcher );
270271
271- if (seen .contains (key )) {
272+ if (processedKeys .contains (currentParagraphKey )) {
272273 return true ;
273274 }
274- seen .add (key );
275+ processedKeys .add (currentParagraphKey );
275276 }
276277
277278 return false ;
@@ -302,7 +303,7 @@ public static String deduplicateTtmlFile(final File subtitleFile) {
302303 }
303304 }
304305
305- public static String deduplicateContent (final String subtitleContent ) {
306+ public static String deduplicateContent (final String ttmlFileContent ) {
306307 // Subtitle entries/paragraphs are considered duplicated only if:
307308 // 1) begin timestamp is exactly the same,
308309 // 2) end timestamp is exactly the same,
@@ -312,30 +313,50 @@ public static String deduplicateContent(final String subtitleContent) {
312313 // This is a normalized comparison (trimmed and whitespace-normalized).
313314 // No semantic analysis or fuzzy matching is performed.
314315
315- if (stringIsNullOrEmpty (subtitleContent )) {
316- return subtitleContent ;
316+ if (stringIsNullOrEmpty (ttmlFileContent )) {
317+ return ttmlFileContent ;
317318 }
318319
319- final Matcher matcher = getTtmlMatcher (subtitleContent );
320-
321- final Set <String > seen = new HashSet <>();
320+ final Set <String > processedKeys = new HashSet <>();
322321 final StringBuilder result = new StringBuilder ();
323322
324- int lastIndex = 0 ;
325- while (matcher .find ()) {
326- result .append (subtitleContent , lastIndex , matcher .start ());
323+ // Create a matcher for all <p>...</p> entries
324+ final Matcher matcher = getTtmlMatcher (ttmlFileContent );
327325
328- final String key = getSubtitleKeyOfTtml (matcher );
326+ // Keep track of the end index of the last processed <p>
327+ int lastParagraphEndIndex = 0 ;
329328
330- if (!seen .contains (key )) {
331- result .append (matcher .group (0 ));
332- seen .add (key );
329+ while (matcher .find ()) {
330+ // Extract the gap between the previous <p> and the current <p>
331+ // - it may contain whitespace, newlines, or other XML elements.
332+ // - it is NOT part of the subtitle paragraph.
333+ // - It is never used for deduplication or screen display.
334+ final String gapBetweenParagraphs = ttmlFileContent .substring (
335+ lastParagraphEndIndex ,
336+ matcher .start ()
337+ );
338+ result .append (gapBetweenParagraphs );
339+
340+ final String currentParagraph = matcher .group (0 );
341+ final String currentParagraphKey = buildDeduplicationKey (matcher );
342+
343+ if (!processedKeys .contains (currentParagraphKey )) {
344+ // Append the ORIGINAL full <p> paragraph.
345+ // - This preserves the author's original formatting
346+ // (runs of whitespace, <br>, etc.).
347+ result .append (currentParagraph );
348+ processedKeys .add (currentParagraphKey );
333349 }
334350
335- lastIndex = matcher .end ();
351+ // Move the last processed index to the end of the current <p>
352+ lastParagraphEndIndex = matcher .end ();
336353 }
337354
338- result .append (subtitleContent .substring (lastIndex ));
355+ // Append any remaining content after the last <p>.
356+ // - Usually contains closing tags like </div>, </body>, </tt>.
357+ final String trailingContent = ttmlFileContent .substring (lastParagraphEndIndex );
358+ result .append (trailingContent );
359+
339360 return result .toString ();
340361 }
341362
@@ -363,7 +384,18 @@ private static Matcher getTtmlMatcher(final String subtitleContent) {
363384 return pattern .matcher (subtitleContent );
364385 }
365386
366- private static String getSubtitleKeyOfTtml (final Matcher matcher ) {
387+ /**
388+ * Generates a deduplication key for one TTML {@code <p>} paragraph.
389+ *
390+ * @param matcher Matcher already positioned on a single {@code <p>} element.
391+ * group(1) = begin time
392+ * group(2) = end time
393+ * group(3) = raw textual content (may contain 'span' tags)
394+ * @return a deduplication key composed of begin/end timestamps
395+ * and normalized text, used to detect whether this subtitle entry
396+ * has already been processed.
397+ */
398+ private static String buildDeduplicationKey (final Matcher matcher ) {
367399 final String begin = matcher .group (1 ).trim ();
368400 final String end = matcher .group (2 ).trim ();
369401
@@ -456,22 +488,20 @@ private static String buildLocalFileUri(final File subtitleCacheFile) {
456488 return path ;
457489 }
458490
459- private static String storeItToCacheDir (final String subtitleContent ,
491+ private static String writeContentToCacheFile (final String subtitleContent ,
460492 final MediaFormat format ,
461493 final SubtitleOrigin currentSubtitleOrigin ,
462494 final File currentCacheFile ) {
463- final File cacheFile = currentCacheFile ;
495+ final String cacheFilePathForExoplayer = buildLocalFileUri ( currentCacheFile ) ;
464496
465- final String cacheFilePathForExoplayer = buildLocalFileUri (cacheFile );
466-
467- if (!ensureItsParentDirExist (cacheFile )) {
497+ if (!ensureItsParentDirExist (currentCacheFile )) {
468498 return null ;
469499 }
470500
471- if (null == writeDeduplicatedContentToCachefile (subtitleContent , cacheFile )) {
501+ if (null == writeContentToFile (subtitleContent , currentCacheFile )) {
472502 return cacheFilePathForExoplayer ;
473503 } else {
474- Log .e (TAG , "Failed to write cache file: " + cacheFile .getAbsolutePath ());
504+ Log .e (TAG , "Failed to write cache file: " + currentCacheFile .getAbsolutePath ());
475505 return null ;
476506 }
477507 }
@@ -526,9 +556,7 @@ private static String buildSubtitleCacheFilename(
526556 }
527557
528558 private static String getLanguageCode (final String remoteSubtitleUrl ) {
529- String languageCode = null ;
530- languageCode = YoutubeParsingHelper .extractLanguageCode (remoteSubtitleUrl );
531- return languageCode ;
559+ return YoutubeParsingHelper .extractLanguageCode (remoteSubtitleUrl );
532560 }
533561
534562 private static String getAutoTranslateLanguage (final String remoteSubtitleUrl ) {
@@ -653,12 +681,6 @@ private static boolean ensureItsParentDirExist(final File tempCacheFile) {
653681 }
654682 }
655683
656- private static String writeDeduplicatedContentToCachefile (
657- final String subtitleContent ,
658- final File tempCacheFile ) {
659- return writeContentToFile (subtitleContent , tempCacheFile );
660- }
661-
662684 private static String writeContentToFile (final String content ,
663685 final File tempFile ) {
664686 try (BufferedWriter writer = new BufferedWriter (new OutputStreamWriter (
0 commit comments