From 4f827dd03d8a4d39ab1b5764091cc77178d96a87 Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:09:37 +0800 Subject: [PATCH 01/13] [duplicated subtitle] Move `SubtitleDeduplicator` from NewPipeExtractor to app side. [Bug] Fix duplicated subtitle issue. - Add core deduplicated logic/method - Reproduce bug with the YouTube video: https://www.youtube.com/watch?v=b7vmW_5HSpE (Observed around 2026-03-03: the subtitle language that previously had duplication issue no longer appears in the captions list) - Introduce `SubtitleDeduplicator.java` to check and remove duplicates, storing results in cache. - Add `SubtitleOrigin` and `SubtitleState` enums to model subtitle type and state. - Ensure cache directory is recreated if missing. --- .../util/subtitle/SubtitleDeduplicator.java | 588 ++++++++++++++++++ .../newpipe/util/subtitle/SubtitleOrigin.java | 38 ++ .../newpipe/util/subtitle/SubtitleState.java | 31 + 3 files changed, 657 insertions(+) create mode 100644 app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java create mode 100644 app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleOrigin.java create mode 100644 app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleState.java diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java new file mode 100644 index 00000000000..70ecb147e93 --- /dev/null +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java @@ -0,0 +1,588 @@ +package org.schabi.newpipe.extractor.utils; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.BufferedWriter; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.HashMap; +import 
java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.annotation.Nonnull; + +import org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper; +import org.schabi.newpipe.extractor.MediaFormat; +import org.schabi.newpipe.extractor.NewPipe; +import org.schabi.newpipe.extractor.downloader.Downloader; +import org.schabi.newpipe.extractor.downloader.Response; +import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; + +/** + * SubtitleDeduplicator.java + * + * 1. This file is responsible for checking if the subtitles + * contain any duplicate entries. + * a) If duplicates are found, it performs the following steps: + * downloads the subtitle (TTML format), deduplicates it, + * and stores it locally. + * b) If no duplicates are found, downloads and stores it. + * + * 2. Core Functions: + * - checkAndDeduplicate(): Checks for duplicate subtitles + * and handles downloading, deduplication, and local storage. + * + */ + +public final class SubtitleDeduplicator { + private static final String TAG = "SubtitleDeduplicator"; + public static final String LOCAL_SUBTITLE_URL_PREFIX = "file://"; + + private static final float BACKOFF_FACTOR = 1.0f; + + private static String subCacheDir = "subtitle_cache"; + + private static File cacheDir = null; + + private SubtitleDeduplicator() { + // no instance + } + + // cacheDir is /storage/emulated/0/Android/data//cache/{subCacheDir} + public static void setCacheDirPath(final String path) { + if (stringIsNullOrEmpty(path)) { + return; + } + + cacheDir = new File(path, subCacheDir); + + createDirIfNotExist(cacheDir); + } + + // Returns either a remote subtitle URL or a local file URI (file://) + // @param remoteSubtitleUrl: A valid YouTube subtitle URL, expected to + // contain videoId and languageCode parameters. 
+ public static String checkAndDeduplicate(final String remoteSubtitleUrl, + final MediaFormat format, + final SubtitleOrigin currentSubtitleOrigin) { + if (!isCacheDirAvailable()) { + printCacheDirNotInitialized(); + return remoteSubtitleUrl; + } + // *** Step 1: Download remote subtitle content + + // - The remote subtitle is ALWAYS downloaded to ensure + // the newest version is used. + // - Although cached subtitles are available, they may be + // outdated since the video creator or the YouTube + // platform can update them. + + // Current subtitle format is TTML + final String downloadedContent = downloadRemoteSubtitleContent( + remoteSubtitleUrl, + currentSubtitleOrigin, + 3, + 1000); + + if (subtitleDownloadFails(downloadedContent)) { + return fallbackToStoredOrRemote(remoteSubtitleUrl, + format, + currentSubtitleOrigin); + } + + String finalContent = null; + SubtitleState currentSubtitleState = SubtitleState.ORIGINAL; + + // *** Step 2: Detect and deduplicate if needed + + if (containsDuplicatedEntries(downloadedContent)) { + finalContent = deduplicateContent(downloadedContent); + currentSubtitleState = SubtitleState.DEDUPLICATED; + } else { + finalContent = downloadedContent; + currentSubtitleState = SubtitleState.ORIGINAL; + } + + // *** Step 3: Store subtitle to cache and return local URI if possible + + final File currentCacheFile = getCacheFile(remoteSubtitleUrl, + format, + currentSubtitleOrigin, + currentSubtitleState); + + final String localSubtitleUri = storeItToCacheDir(finalContent, + format, + currentSubtitleOrigin, + currentCacheFile); + + if (subtitleStorageFails(localSubtitleUri)) { + return fallbackToStoredOrRemote(remoteSubtitleUrl, + format, + currentSubtitleOrigin); + } + + return localSubtitleUri; + } + + private static boolean isCacheDirAvailable() { + if (null == cacheDir) { + return false; + } + + return createDirIfNotExist(cacheDir); + } + + private static boolean createDirIfNotExist(final File directory) { + if (!directory.exists()) 
{ + directory.mkdirs(); + } + + return ((directory.exists()) && (directory.isDirectory())); + } + + private static void printCacheDirNotInitialized() { + final String errorMessage = + "SubtitleDeduplicator cache directory is not initialized. " + + "Fallback to original subtitle without deduplication. " + + "setCacheDirPath() should be called before using this class."; + + System.err.println(TAG + ": " + errorMessage); + } + + private static String downloadRemoteSubtitleContent(final String urlStr, + final SubtitleOrigin currentOrigin, + final int maxRetries, + final int initialDelayMillis) { + final Downloader downloader = NewPipe.getDownloader(); + if (downloader == null) { + System.err.println(TAG + ": Downloader not initialized"); + return null; + } + // if auto-translate language subtitle, use the bigger data. + int delay = resolveDelay(currentOrigin, initialDelayMillis); + for (int attempt = 1; attempt <= maxRetries; attempt++) { + try { + final Map<String, List<String>> headers = new HashMap<>(); + headers.put("Accept", Collections.singletonList("text/*")); + headers.put("Accept-Language", Collections.singletonList("en-US,en;q=0.9")); + final Response response = downloader.get(urlStr, headers); + if (response.responseCode() == 200) { + return response.responseBody(); + } else { + System.err.println(TAG + ": Attempt " + attempt + + " failed with status: " + + response.responseCode() + + " URL: " + urlStr); + if (response.responseCode() != 503 && response.responseCode() != 429) { + return null; + } + } + } catch (IOException | ReCaptchaException e) { + System.err.println(TAG + ": Attempt " + attempt + + " failed: " + e.getMessage() + + " URL: " + urlStr); + } + if (attempt < maxRetries) { + try { + Thread.sleep(delay); + delay = adjustDelayAfterRetry(delay); + } catch (final InterruptedException ie) { + Thread.currentThread().interrupt(); + return null; + } + } + } + System.err.println(TAG + ": Failed to download subtitle after " + + maxRetries + " URL: " + urlStr); + return null; 
+ } + + private static boolean isAutoTranslateSubtitle(final SubtitleOrigin currentOrigin) { + return (currentOrigin == SubtitleOrigin.AUTO_TRANSLATED); + } + + private static int resolveDelay(final SubtitleOrigin currentOrigin, + final int baseDelayMillis) { + if (isAutoTranslateSubtitle(currentOrigin)) { + // Auto-translated subtitles are observed to be less reliable. + // A separate delay path is kept to allow future tuning without + // affecting the common subtitle download flow. + return (baseDelayMillis + 1); + } else { + return baseDelayMillis; + } + } + + private static int adjustDelayAfterRetry(final int currentDelayMillis) { + return (int) (currentDelayMillis * BACKOFF_FACTOR); + } + + public static boolean containsDuplicateTtmlEntries(final File subtitleFile) { + if (subtitleFile == null || !subtitleFile.exists()) { + return false; + } + + try { + final String content = readFileToString(subtitleFile); + return containsDuplicatedEntries(content); + } catch (final IOException e) { + e.printStackTrace(); + return false; + } + } + + // Detects whether the subtitle contains duplicated <p> entries + // using the same normalized (whitespace-trimmed) comparison rules + // as deduplicateContent(). + public static boolean containsDuplicatedEntries(final String subtitleContent) { + if (stringIsNullOrEmpty(subtitleContent)) { + return false; + } + + final Matcher matcher = getTtmlMatcher(subtitleContent); + + final Set<String> seen = new HashSet<>(); + while (matcher.find()) { + final String key = getSubtitleKeyOfTtml(matcher); + + if (seen.contains(key)) { + return true; + } + seen.add(key); + } + + return false; + } + + private static String readFileToString(final File file) throws IOException { + final StringBuilder sb = new StringBuilder(); + try (BufferedReader reader = new BufferedReader(new FileReader(file))) { + String line; + while ((line = reader.readLine()) != null) { + sb.append(line).append("\n"); + } + } + return sb.toString(); + } + + public static String deduplicateTtmlFile(final File subtitleFile) { + if (subtitleFile == null || !subtitleFile.exists()) { + return ""; + } + + try { + final String content = readFileToString(subtitleFile); + return deduplicateContent(content); + } catch (final IOException e) { + e.printStackTrace(); + return ""; + } + } + + public static String deduplicateContent(final String subtitleContent) { + // Subtitle entries are considered duplicated only if: + // 1) begin timestamp is exactly the same, + // 2) end timestamp is exactly the same, + // 3) subtitle text content is the same + // after normalized (trimming and whitespace normalization). + // + // This is a normalized comparison (trimmed and whitespace-normalized). + // No semantic analysis or fuzzy matching is performed. 
+ + if (stringIsNullOrEmpty(subtitleContent)) { + return subtitleContent; + } + + final Matcher matcher = getTtmlMatcher(subtitleContent); + + final Set<String> seen = new HashSet<>(); + final StringBuilder result = new StringBuilder(); + + int lastIndex = 0; + while (matcher.find()) { + result.append(subtitleContent, lastIndex, matcher.start()); + + final String key = getSubtitleKeyOfTtml(matcher); + + if (!seen.contains(key)) { + result.append(matcher.group(0)); + seen.add(key); + } + + lastIndex = matcher.end(); + } + + result.append(subtitleContent.substring(lastIndex)); + return result.toString(); + } + + private static boolean stringIsNullOrEmpty(final String inputString) { + if (null == inputString) { + return true; + } + + if (inputString.isEmpty()) { + return true; + } + + return false; + } + + private static Pattern defineTtmlSubtitlePattern() { + return Pattern.compile( + "<p[^>]*begin=\"([^\"]+)\"[^>]*end=\"([^\"]+)\"[^>]*>(.*?)</p>", + Pattern.DOTALL + ); + } + + private static Matcher getTtmlMatcher(final String subtitleContent) { + final Pattern pattern = defineTtmlSubtitlePattern(); + return pattern.matcher(subtitleContent); + } + + private static String getSubtitleKeyOfTtml(final Matcher matcher) { + final String begin = matcher.group(1).trim(); + final String end = matcher.group(2).trim(); + + // Normalize subtitle text before comparison: + // - Leading and trailing whitespace is ignored + // - Runs of whitespace are collapsed into a single space (' ') + // + // Note: + // This operates on raw TTML text as received (before XML entity decoding). + // XML-encoded whitespace (e.g. ) is not decoded at this stage. + // + // This is intentional: visually identical subtitles may differ only + // in whitespace due to formatting or extraction differences, and + // should be considered duplicates in such cases. + final String content = matcher.group(3) + .trim() + .replaceAll("\\s+", " "); + + final String key = begin + "|" + end + "|" + content; + return key; + } + + private static String buildLocalFileUri(final File subtitleCacheFile) { + final String path = LOCAL_SUBTITLE_URL_PREFIX + subtitleCacheFile.getAbsolutePath(); + + return path; + } + + private static String storeItToCacheDir(final String subtitleContent, + final MediaFormat format, + final SubtitleOrigin currentSubtitleOrigin, + final File currentCacheFile) { + final File cacheFile = currentCacheFile; + + final String cacheFilePathForExoplayer = buildLocalFileUri(cacheFile); + + if (!ensureItsParentDirExist(cacheFile)) { + return null; + } + + if (null == writeDeduplicatedContentToCachefile(subtitleContent, cacheFile)) { + return cacheFilePathForExoplayer; + } else { + System.err.println(TAG + ": Failed to write cache file: " + + cacheFile.getAbsolutePath()); + return null; + } + } + + // filename without dir path + private static String computeFilename(final String subtitleUrl, + final MediaFormat format, + final SubtitleOrigin 
currentSubtitleOrigin, + final SubtitleState currentSubtitleState) { + final String videoId = getVideoId(subtitleUrl); + + final String languageCode = resolveSubtitleLanguage( + subtitleUrl, + currentSubtitleOrigin + ); + + final String filename = buildSubtitleCacheFilename(videoId, + languageCode, + currentSubtitleOrigin, + currentSubtitleState, + format.getSuffix()); + + return filename; + } + + public static SubtitleOrigin getSubtitleOrigin(final boolean autoGenerated, + final boolean autoTranslate) { + if (autoTranslate) { + return SubtitleOrigin.AUTO_TRANSLATED; + } + if (autoGenerated) { + return SubtitleOrigin.AUTO_GENERATED; + } + return SubtitleOrigin.HUMAN_PROVIDED; + } + + @Nonnull + private static String buildSubtitleCacheFilename( + @Nonnull final String videoId, + @Nonnull final String language, + @Nonnull final SubtitleOrigin origin, + @Nonnull final SubtitleState state, + @Nonnull final String extension + ) { + final String filenamePartSeparator = "--"; + + return videoId + + filenamePartSeparator + language + + filenamePartSeparator + origin.getId() + + filenamePartSeparator + state.getId() + + "." + extension; + } + + private static String getLanguageCode(final String remoteSubtitleUrl) { + String languageCode = null; + languageCode = YoutubeParsingHelper.extractLanguageCode(remoteSubtitleUrl); + return languageCode; + } + + private static String getAutoTranslateLanguage(final String remoteSubtitleUrl) { + // For auto-translate subtitles Url, there are two language code in it: + // one is 'lang', now its meaning is source language; + // the other is 'tlang', its meaning is target language. + String targetAutoTranslate = null; + targetAutoTranslate = YoutubeParsingHelper.extractTranslationCode( + remoteSubtitleUrl + ); + return targetAutoTranslate; + } + + // For auto-translate subtitles, the cache filename language + // represents the target language (tlang), not the source language. 
+ private static String resolveSubtitleLanguage( + final String subtitleUrl, + final SubtitleOrigin origin + ) { + if (origin == SubtitleOrigin.AUTO_TRANSLATED) { + final String targetLang = getAutoTranslateLanguage(subtitleUrl); + + if (!stringIsNullOrEmpty(targetLang)) { + return targetLang; + } else { + final String unknownLanguage = "unknownLanguage"; + return unknownLanguage; + } + } + + return getLanguageCode(subtitleUrl); + } + + // Extract the videoId (e.g., "b7vmW_5HSpE") from a subtitle URL + // (e.g., .../api/timedtext?v=b7vmW_5HSpE) + // for use in generating unique filenames. + private static String getVideoId(final String remoteSubtitleUrl) { + return YoutubeParsingHelper.extractVideoId(remoteSubtitleUrl); + } + + private static File getCacheFile(final String subtitleUrl, + final MediaFormat format, + final SubtitleOrigin currentSubtitleOrigin, + final SubtitleState currentSubtitleState) { + final String cachefilename = computeFilename(subtitleUrl, + format, + currentSubtitleOrigin, + currentSubtitleState); + + final File cacheFile = new File(cacheDir, cachefilename); + + return cacheFile; + } + + private static File findStoredCacheFile( + final String remoteSubtitleUrl, + final MediaFormat format, + final SubtitleOrigin currentSubtitleOrigin + ) { + for (final SubtitleState state : SubtitleState.values()) { + final File subtitleFile = getCacheFile( + remoteSubtitleUrl, + format, + currentSubtitleOrigin, + state + ); + + if (subtitleFile.exists() && subtitleFile.length() > 0) { + return subtitleFile; + } + } + + return null; + } + + @Nonnull + private static String fallbackToStoredOrRemote( + @Nonnull final String remoteSubtitleUrl, + @Nonnull final MediaFormat format, + @Nonnull final SubtitleOrigin origin + ) { + final File storedFile = findStoredCacheFile( + remoteSubtitleUrl, + format, + origin + ); + + if (storedFile != null) { + final String previousStoredUri = buildLocalFileUri(storedFile); + return previousStoredUri; + } + + return 
remoteSubtitleUrl; + } + + private static boolean subtitleDownloadFails(final String contentDownloaded) { + return (null == contentDownloaded); + } + + private static boolean subtitleStorageFails(final String localUriAfterStores) { + return (null == localUriAfterStores); + } + + private static boolean ensureItsParentDirExist(final File tempCacheFile) { + final File parentDir = tempCacheFile.getParentFile(); + + if (parentDir.exists()) { + return true; + } else { + final boolean result = parentDir.mkdirs(); + return result; + } + } + + private static String writeDeduplicatedContentToCachefile( + final String subtitleContent, + final File tempCacheFile) { + return writeContentToFile(subtitleContent, tempCacheFile); + } + + private static String writeContentToFile(final String content, + final File tempFile) { + try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( + new FileOutputStream(tempFile), StandardCharsets.UTF_8))) { + writer.write(content); + //ok + return null; + } catch (final IOException e) { + final String errorMessage = e.getMessage(); + System.err.println(TAG + ": Failed to write cache file: " + errorMessage); + return errorMessage; + } + } + +} diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleOrigin.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleOrigin.java new file mode 100644 index 00000000000..54b9ba44d54 --- /dev/null +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleOrigin.java @@ -0,0 +1,38 @@ +package org.schabi.newpipe.extractor.utils; + +import javax.annotation.Nonnull; + +/** + * Describes the origin of a subtitle -> how its content was produced. + * + * - Currently, this enum covers YouTube subtitles, but + * it is designed to be extensible for other platforms + * (e.g. Peertube) in the future. 
+ */ +public enum SubtitleOrigin { + + /** + * Currently, YouTube subtitles are categorized + * into three types: + * - Subtitles manually uploaded by the video creator + * (HUMAN_PROVIDED). + * - Subtitles automatically generated by YouTube's + * speech recognition (AUTO_GENERATED). + * - Subtitles automatically translated by YouTube + * from an existing subtitle track (AUTO_TRANSLATED). + */ + HUMAN_PROVIDED("human_provided"), + AUTO_GENERATED("auto_generated"), + AUTO_TRANSLATED("auto_translated"); + + private final String id; + + SubtitleOrigin(@Nonnull final String id) { + this.id = id; + } + + @Nonnull + public String getId() { + return id; + } +} diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleState.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleState.java new file mode 100644 index 00000000000..eb5b360bed5 --- /dev/null +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleState.java @@ -0,0 +1,31 @@ +package org.schabi.newpipe.extractor.utils; + +import javax.annotation.Nonnull; + +/** + * Describes the processing state of a subtitle. + * + * - This enum represents whether the subtitle content + * is original or has been post-processed (e.g. deduplicated). + * - Unlike `enum SubtitleOrigin`, this does not describe + * how the subtitle was created, but how it has been + * processed locally. 
+ */ +public enum SubtitleState { + + // Original subtitle content, no modifications + ORIGINAL("original"), + // Subtitle content after deduplication processing + DEDUPLICATED("deduplicated"); + + private final String id; + + SubtitleState(@Nonnull final String id) { + this.id = id; + } + + @Nonnull + public String getId() { + return id; + } +} From ef2a0a9c496169e9e57a25caa99a087102742871 Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:10:07 +0800 Subject: [PATCH 02/13] [duplicated subtitle] Fix compilation errors after moving from NewPipeExtractor to NewPipe repository. - Changed `package` and `import` statements to adapt to NewPipe main repository. - Replace `javax.annotation.Nonnull` with `androidx.annotation.NonNull` for compatibility with `androidx`, replacing `javax`. --- .../util/subtitle/SubtitleDeduplicator.java | 24 +++++++++---------- .../newpipe/util/subtitle/SubtitleOrigin.java | 8 +++---- .../newpipe/util/subtitle/SubtitleState.java | 8 +++---- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java index 70ecb147e93..c7ae7cf24bc 100644 --- a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java @@ -1,4 +1,4 @@ -package org.schabi.newpipe.extractor.utils; +package org.schabi.newpipe.util.subtitle; import java.io.BufferedReader; import java.io.File; @@ -17,7 +17,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import javax.annotation.Nonnull; +import androidx.annotation.NonNull; import org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper; import org.schabi.newpipe.extractor.MediaFormat; @@ -430,13 +430,13 @@ public static SubtitleOrigin getSubtitleOrigin(final boolean autoGenerated, 
return SubtitleOrigin.HUMAN_PROVIDED; } - @Nonnull + @NonNull private static String buildSubtitleCacheFilename( - @Nonnull final String videoId, - @Nonnull final String language, - @Nonnull final SubtitleOrigin origin, - @Nonnull final SubtitleState state, - @Nonnull final String extension + @NonNull final String videoId, + @NonNull final String language, + @NonNull final SubtitleOrigin origin, + @NonNull final SubtitleState state, + @NonNull final String extension ) { final String filenamePartSeparator = "--"; @@ -526,11 +526,11 @@ private static File findStoredCacheFile( return null; } - @Nonnull + @NonNull private static String fallbackToStoredOrRemote( - @Nonnull final String remoteSubtitleUrl, - @Nonnull final MediaFormat format, - @Nonnull final SubtitleOrigin origin + @NonNull final String remoteSubtitleUrl, + @NonNull final MediaFormat format, + @NonNull final SubtitleOrigin origin ) { final File storedFile = findStoredCacheFile( remoteSubtitleUrl, diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleOrigin.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleOrigin.java index 54b9ba44d54..12fe7a7fe8f 100644 --- a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleOrigin.java +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleOrigin.java @@ -1,6 +1,6 @@ -package org.schabi.newpipe.extractor.utils; +package org.schabi.newpipe.util.subtitle; -import javax.annotation.Nonnull; +import androidx.annotation.NonNull; /** * Describes the origin of a subtitle -> how its content was produced. 
@@ -27,11 +27,11 @@ public enum SubtitleOrigin { private final String id; - SubtitleOrigin(@Nonnull final String id) { + SubtitleOrigin(@NonNull final String id) { this.id = id; } - @Nonnull + @NonNull public String getId() { return id; } diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleState.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleState.java index eb5b360bed5..029ccc8fd3d 100644 --- a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleState.java +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleState.java @@ -1,6 +1,6 @@ -package org.schabi.newpipe.extractor.utils; +package org.schabi.newpipe.util.subtitle; -import javax.annotation.Nonnull; +import androidx.annotation.NonNull; /** * Describes the processing state of a subtitle. @@ -20,11 +20,11 @@ public enum SubtitleState { private final String id; - SubtitleState(@Nonnull final String id) { + SubtitleState(@NonNull final String id) { this.id = id; } - @Nonnull + @NonNull public String getId() { return id; } From 41e158cd9ba3c0ad8fc3d900fc67eba1024a967d Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:10:37 +0800 Subject: [PATCH 03/13] [duplicated subtitle] Replace System.err.println with Android Log in app-side code. 
--- .../util/subtitle/SubtitleDeduplicator.java | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java index c7ae7cf24bc..1361da5fadd 100644 --- a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java @@ -19,6 +19,8 @@ import androidx.annotation.NonNull; +import android.util.Log; + import org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper; import org.schabi.newpipe.extractor.MediaFormat; import org.schabi.newpipe.extractor.NewPipe; @@ -154,7 +156,7 @@ private static void printCacheDirNotInitialized() { + "Fallback to original subtitle without deduplication. " + "setCacheDirPath() should be called before using this class."; - System.err.println(TAG + ": " + errorMessage); + Log.w(TAG, errorMessage); } private static String downloadRemoteSubtitleContent(final String urlStr, @@ -163,7 +165,7 @@ private static String downloadRemoteSubtitleContent(final String urlStr, final int initialDelayMillis) { final Downloader downloader = NewPipe.getDownloader(); if (downloader == null) { - System.err.println(TAG + ": Downloader not initialized"); + Log.w(TAG, "Downloader not initialized- cannot download subtitles"); return null; } // if auto-translate language subtitle, use the bigger data. 
@@ -177,18 +179,14 @@ private static String downloadRemoteSubtitleContent(final String urlStr, if (response.responseCode() == 200) { return response.responseBody(); } else { - System.err.println(TAG + ": Attempt " + attempt - + " failed with status: " - + response.responseCode() - + " URL: " + urlStr); + Log.w(TAG, "Attempt " + attempt + " failed with status: " + + response.responseCode() + " URL: " + urlStr); if (response.responseCode() != 503 && response.responseCode() != 429) { return null; } } } catch (IOException | ReCaptchaException e) { - System.err.println(TAG + ": Attempt " + attempt - + " failed: " + e.getMessage() - + " URL: " + urlStr); + Log.w(TAG, "Attempt " + attempt + " failed for URL: " + urlStr, e); } if (attempt < maxRetries) { try { @@ -200,8 +198,8 @@ private static String downloadRemoteSubtitleContent(final String urlStr, } } } - System.err.println(TAG + ": Failed to download subtitle after " - + maxRetries + " URL: " + urlStr); + Log.e(TAG, "Failed to download subtitle after " + maxRetries + + " attempts. 
URL: " + urlStr); return null; } @@ -392,8 +390,7 @@ private static String storeItToCacheDir(final String subtitleContent, if (null == writeDeduplicatedContentToCachefile(subtitleContent, cacheFile)) { return cacheFilePathForExoplayer; } else { - System.err.println(TAG + ": Failed to write cache file: " - + cacheFile.getAbsolutePath()); + Log.e(TAG, "Failed to write cache file: " + cacheFile.getAbsolutePath()); return null; } } @@ -579,9 +576,8 @@ private static String writeContentToFile(final String content, //ok return null; } catch (final IOException e) { - final String errorMessage = e.getMessage(); - System.err.println(TAG + ": Failed to write cache file: " + errorMessage); - return errorMessage; + Log.e(TAG, "Failed to write cache file", e); + return e.getMessage(); } } From d3f794587ea5d031ffa0f17f8547cad98c6f859e Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:11:07 +0800 Subject: [PATCH 04/13] [duplicated subtitle][YouTube] Restrict `SubtitleDeduplicator` to YouTube-related URLs. - `SubtitleDeduplicator` relies on YouTube-specific subtitle URL semantics (videoId, languageCode, translationCode) for cache file naming and deduplication. - Add `isYoutubeRelatedUrl()` to ensure deduplication logic is only applied to YouTube URLs. For non-YouTube subtitle URLs, the original subtitle URL is returned unchanged. 
--- .../util/subtitle/SubtitleDeduplicator.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java index 1361da5fadd..a0544411a3c 100644 --- a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java @@ -17,6 +17,9 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.net.MalformedURLException; +import java.net.URL; + import androidx.annotation.NonNull; import android.util.Log; @@ -75,6 +78,13 @@ public static void setCacheDirPath(final String path) { public static String checkAndDeduplicate(final String remoteSubtitleUrl, final MediaFormat format, final SubtitleOrigin currentSubtitleOrigin) { + // Subtitle deduplication relies on YouTube-specific subtitle URL semantics + // (videoId, languageCode, translationCode) which are used for cache file naming. + // For non-YouTube URLs, the original subtitle is returned unchanged. 
+ if (!isYoutubeRelatedUrl(remoteSubtitleUrl)) { + return remoteSubtitleUrl; + } + if (!isCacheDirAvailable()) { printCacheDirNotInitialized(); return remoteSubtitleUrl; @@ -488,6 +498,16 @@ private static String getVideoId(final String remoteSubtitleUrl) { return YoutubeParsingHelper.extractVideoId(remoteSubtitleUrl); } + private static boolean isYoutubeRelatedUrl(@NonNull final String url) { + try { + final URL parsedUrl = new URL(url); + return (YoutubeParsingHelper.isYoutubeURL(parsedUrl) + || YoutubeParsingHelper.isYoutubeServiceURL(parsedUrl)); + } catch (final MalformedURLException e) { + return false; + } + } + private static File getCacheFile(final String subtitleUrl, final MediaFormat format, final SubtitleOrigin currentSubtitleOrigin, From 5981a90762e5b3e963a753df6897ad76ac038cd5 Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:11:37 +0800 Subject: [PATCH 05/13] [duplicated subtitle] Introduce `CacheDirUtils` for app cache directory selection. - This commit introduces `CacheDirUtils` to centralize application cache directory selection logic. - The preferred cache directory path is now initialized in `App.onCreate()` and passed to `SubtitleDeduplicator`, instead of relying on `StateSaver.init()`.
--- app/src/main/java/org/schabi/newpipe/App.kt | 6 ++ .../schabi/newpipe/util/CacheDirUtils.java | 66 +++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 app/src/main/java/org/schabi/newpipe/util/CacheDirUtils.java diff --git a/app/src/main/java/org/schabi/newpipe/App.kt b/app/src/main/java/org/schabi/newpipe/App.kt index 3ca259528ac..dd1561f36c8 100644 --- a/app/src/main/java/org/schabi/newpipe/App.kt +++ b/app/src/main/java/org/schabi/newpipe/App.kt @@ -34,12 +34,14 @@ import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeStreamExt import org.schabi.newpipe.ktx.hasAssignableCause import org.schabi.newpipe.settings.NewPipeSettings import org.schabi.newpipe.util.BridgeStateSaverInitializer +import org.schabi.newpipe.util.CacheDirUtils import org.schabi.newpipe.util.Localization import org.schabi.newpipe.util.ServiceHelper import org.schabi.newpipe.util.StateSaver import org.schabi.newpipe.util.image.ImageStrategy import org.schabi.newpipe.util.image.PreferredImageQuality import org.schabi.newpipe.util.potoken.PoTokenProviderImpl +import org.schabi.newpipe.util.subtitle.SubtitleDeduplicator /* * Copyright (C) Hans-Christoph Steiner 2016 @@ -93,6 +95,8 @@ open class App : .getInt(getString(R.string.last_used_preferences_version), -1) isFirstRun = lastUsedPrefVersion == -1 + val appCacheDirPath = CacheDirUtils.getPreferredAppCacheDirPath(this) + // Initialize settings first because other initializations can use its values NewPipeSettings.initSettings(this) @@ -124,6 +128,8 @@ open class App : configureRxJavaErrorHandler() YoutubeStreamExtractor.setPoTokenProvider(PoTokenProviderImpl) + + SubtitleDeduplicator.setCacheDirPath(appCacheDirPath) } override fun newImageLoader(context: Context): ImageLoader = ImageLoader diff --git a/app/src/main/java/org/schabi/newpipe/util/CacheDirUtils.java b/app/src/main/java/org/schabi/newpipe/util/CacheDirUtils.java new file mode 100644 index 00000000000..dc6a0978422 --- /dev/null +++ 
b/app/src/main/java/org/schabi/newpipe/util/CacheDirUtils.java @@ -0,0 +1,66 @@ +package org.schabi.newpipe.util; + +import java.io.File; + +import android.content.Context; +import androidx.annotation.NonNull; + +public final class CacheDirUtils { + + private CacheDirUtils() { + // no instance + } + + public static String getExternalAppCacheDirPath( + @NonNull final Context context) { + final File externalCacheDir = context.getExternalCacheDir(); + if (null != externalCacheDir) { + // /storage/emulated/0/Android/data//cache/ + return externalCacheDir.getAbsolutePath(); + } + + return null; + } + + public static String getInternalAppCacheDirPath( + @NonNull final Context context) { + // always available, never be 'null' + // /data/user/0//cache/ + return context.getCacheDir().getAbsolutePath(); + } + + /** + * Returns the preferred cache directory path for the application. + * + * Prefers the external cache directory when available + * (user-accessible, larger space), + * falls back to the internal private cache directory otherwise + * (always available, more secure). + * + * Typical paths: + * - External: /storage/emulated/0/Android/data//cache/ + * - Internal: /data/user/0//cache/ + * (or /data/data//cache/ on some devices) + * + * Note: The 'external' and 'internal' cache directories mentioned above + * are Android terms. They are typically located on the device's + * built-in storage and are not related to removable SD/TF cards. + * + * User "Clear Cache" in app settings deletes files in both locations. 
+ * + * @param context used to get the available cache dir + * @return absolute path string, never null + */ + @NonNull + public static String getPreferredAppCacheDirPath( + @NonNull final Context context) { + + final String externalCacheDirPath = getExternalAppCacheDirPath(context); + if (null != externalCacheDirPath) { + return externalCacheDirPath; + } + + // Internal cache dir should always be available + return getInternalAppCacheDirPath(context); + } +} From 5f53c68cb86eecec587a8fd710a86b3a363d1982 Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:12:07 +0800 Subject: [PATCH 06/13] [duplicated subtitle][unit test] Move `SubtitleDeduplicatorTest` from NewPipeExtractor to app side. Add unit tests for `SubtitleDeduplicator` in `SubtitleDeduplicatorTest.java`. --- .../subtitle/SubtitleDeduplicatorTest.java | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 app/src/test/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicatorTest.java diff --git a/app/src/test/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicatorTest.java b/app/src/test/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicatorTest.java new file mode 100644 index 00000000000..bbf2faef45c --- /dev/null +++ b/app/src/test/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicatorTest.java @@ -0,0 +1,98 @@ +package org.schabi.newpipe.extractor.utils; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; + +public class SubtitleDeduplicatorTest { + + @Test + public void deduplicate_exactDuplicateEntries_shouldRemoveDuplicate() { + String input = + "

Hello

\n" + + "

Hello

"; + + String output = SubtitleDeduplicator.deduplicateContent(input); + + String expected = + "

Hello

"; + + // The `strip()` method is used here to remove the trailing + // newline character (\n, outside of

tags) at the end of the `output`. + // Removing this (\n) does not affect the TTML subtitle paragraphs, + // as only the content within

tags is considered valid for subtitles. + assertEquals(expected, output.strip()); + } + + @Test + public void deduplicate_sameTimeDifferentText_shouldNotDeduplicate() { + String input = + "

Hello

\n" + + "

World

"; + + String output = SubtitleDeduplicator.deduplicateContent(input); + + String expected = input; + + assertEquals(expected, output); + } + + @Test + public void deduplicate_sameTextDifferentTime_shouldNotDeduplicate() { + String input = + "

Hello

\n" + + "

Hello

"; + + String output = SubtitleDeduplicator.deduplicateContent(input); + + String expected = input; + + assertEquals(expected, output); + } + + @Test + public void containsDuplicatedEntries_exactDuplicate_shouldReturnTrue() { + String input = + "

Hello

\n" + + "

Hello

"; + + assertTrue(SubtitleDeduplicator.containsDuplicatedEntries(input)); + } + + @Test + public void containsDuplicatedEntries_noDuplicate_shouldReturnFalse() { + String input = + "

Hello

\n" + + "

World

"; + + assertFalse(SubtitleDeduplicator.containsDuplicatedEntries(input)); + } + + @Test + public void containsDuplicatedEntries_normalizeLeadingAndTrailingWhitespace_shouldConsiderAsSame() { + // Note: + // This test verifies that the deduplication logic normalizes + // leading and trailing whitespace, and considers the content + // as the same after this normalization, without modifying + // the original subtitle content. + String input = + "

Hello world

\n" + + "

Hello world

"; + assertTrue(SubtitleDeduplicator.containsDuplicatedEntries(input)); + } + + @Test + public void containsDuplicatedEntries_normalizeMultipleSpaces_shouldConsiderAsSingleSpace() { + // Note: + // This test verifies that the deduplication logic normalizes + // multiple consecutive spaces into a single space, + // considering the content as the same after this normalization, + // without modifying the original subtitle content. + String input = + "

Hello world

\n" + + "

Hello world

"; + assertTrue(SubtitleDeduplicator.containsDuplicatedEntries(input)); + } +} From 007c501ab12fbecb890ec611bf2a9eb2e45bdc6d Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:12:37 +0800 Subject: [PATCH 07/13] [duplicated subtitle][unit test] Fix `./gradlew test --no-configuration-cache` errors after moving from NewPipeExtractor to NewPipe repository. - "error: static import only from classes and interfaces" - Changed `package` and `import` statements to adapt to NewPipe main repository. - Name 'containsDuplicatedEntries_exactDuplicate_shouldReturnTrue' must match pattern '^[a-z][a-zA-Z0-9]*$'. - Variable 'expected' should be declared final. - '+' should be on a new line. - Line is longer than 100 characters --- .../subtitle/SubtitleDeduplicatorTest.java | 78 +++++++++---------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/app/src/test/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicatorTest.java b/app/src/test/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicatorTest.java index bbf2faef45c..4f47900dfa4 100644 --- a/app/src/test/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicatorTest.java +++ b/app/src/test/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicatorTest.java @@ -1,22 +1,22 @@ -package org.schabi.newpipe.extractor.utils; +package org.schabi.newpipe.util.subtitle; -import org.junit.jupiter.api.Test; +import org.junit.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; public class SubtitleDeduplicatorTest { @Test - public void deduplicate_exactDuplicateEntries_shouldRemoveDuplicate() { - String input = - "

Hello

\n" + - "

Hello

"; + public void deduplicateExactDuplicateEntriesShouldRemoveDuplicate() { + final String input = + "

Hello

\n" + + "

Hello

"; - String output = SubtitleDeduplicator.deduplicateContent(input); + final String output = SubtitleDeduplicator.deduplicateContent(input); - String expected = + final String expected = "

Hello

"; // The `strip()` method is used here to remove the trailing @@ -27,72 +27,72 @@ public void deduplicate_exactDuplicateEntries_shouldRemoveDuplicate() { } @Test - public void deduplicate_sameTimeDifferentText_shouldNotDeduplicate() { - String input = - "

Hello

\n" + - "

World

"; + public void deduplicateSameTimeDifferentTextShouldNotDeduplicate() { + final String input = + "

Hello

\n" + + "

World

"; - String output = SubtitleDeduplicator.deduplicateContent(input); + final String output = SubtitleDeduplicator.deduplicateContent(input); - String expected = input; + final String expected = input; assertEquals(expected, output); } @Test - public void deduplicate_sameTextDifferentTime_shouldNotDeduplicate() { - String input = - "

Hello

\n" + - "

Hello

"; + public void deduplicateSameTextDifferentTimeShouldNotDeduplicate() { + final String input = + "

Hello

\n" + + "

Hello

"; - String output = SubtitleDeduplicator.deduplicateContent(input); + final String output = SubtitleDeduplicator.deduplicateContent(input); - String expected = input; + final String expected = input; assertEquals(expected, output); } @Test - public void containsDuplicatedEntries_exactDuplicate_shouldReturnTrue() { - String input = - "

Hello

\n" + - "

Hello

"; + public void containsDuplicatedEntriesExactDuplicateShouldReturnTrue() { + final String input = + "

Hello

\n" + + "

Hello

"; assertTrue(SubtitleDeduplicator.containsDuplicatedEntries(input)); } @Test - public void containsDuplicatedEntries_noDuplicate_shouldReturnFalse() { - String input = - "

Hello

\n" + - "

World

"; + public void containsDuplicatedEntriesNoDuplicateShouldReturnFalse() { + final String input = + "

Hello

\n" + + "

World

"; assertFalse(SubtitleDeduplicator.containsDuplicatedEntries(input)); } @Test - public void containsDuplicatedEntries_normalizeLeadingAndTrailingWhitespace_shouldConsiderAsSame() { + public void containsDuplicatesNormalizeLeadingAndTrailingWhitespaceShouldConsiderAsSame() { // Note: // This test verifies that the deduplication logic normalizes // leading and trailing whitespace, and considers the content // as the same after this normalization, without modifying // the original subtitle content. - String input = - "

Hello world

\n" + - "

Hello world

"; + final String input = + "

Hello world

\n" + + "

Hello world

"; assertTrue(SubtitleDeduplicator.containsDuplicatedEntries(input)); } @Test - public void containsDuplicatedEntries_normalizeMultipleSpaces_shouldConsiderAsSingleSpace() { + public void containsDuplicatedEntriesNormalizeMultipleSpacesShouldConsiderAsSingleSpace() { // Note: // This test verifies that the deduplication logic normalizes // multiple consecutive spaces into a single space, // considering the content as the same after this normalization, // without modifying the original subtitle content. - String input = - "

Hello world

\n" + - "

Hello world

"; + final String input = + "

Hello world

\n" + + "

Hello world

"; assertTrue(SubtitleDeduplicator.containsDuplicatedEntries(input)); } } From 0fadb573d98676a913c42eac6e8d2b9aaa481b08 Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:13:07 +0800 Subject: [PATCH 08/13] [duplicated subtitle][YouTube] Introduce AppStreamInfo to normalize subtitles in app layer. Introduce a new domain class `AppStreamInfo` to perform subtitle normalization (deduplication) on the application side without modifying the extractor data `StreamInfo`. Previously subtitle deduplication logic modified `StreamInfo` directly, which mixes application concerns with extractor data structures. This change separates responsibilities by projecting extractor `StreamInfo` into an app-level domain object. Key points: - Preserve original `StreamInfo` from the extractor unchanged - Perform subtitle deduplication once per `AppStreamInfo` instance, lazily on the first `loadNormalizedSubtitles()` call, and cache the result - Provide normalized subtitle list for player and download usage - Ensure subtitle normalization logic is centralized and reusable - `from()` only performs object creation - subtitle deduplication (which requires network downloading) happens in `loadNormalizedSubtitles()`.
--- .../schabi/newpipe/streams/AppStreamInfo.java | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 app/src/main/java/org/schabi/newpipe/streams/AppStreamInfo.java diff --git a/app/src/main/java/org/schabi/newpipe/streams/AppStreamInfo.java b/app/src/main/java/org/schabi/newpipe/streams/AppStreamInfo.java new file mode 100644 index 00000000000..9cf1df7a18d --- /dev/null +++ b/app/src/main/java/org/schabi/newpipe/streams/AppStreamInfo.java @@ -0,0 +1,109 @@ +package org.schabi.newpipe.streams; + +import androidx.annotation.NonNull; +import androidx.annotation.Nullable; + +import android.util.Log; + +import org.schabi.newpipe.extractor.stream.StreamInfo; +import org.schabi.newpipe.extractor.stream.SubtitlesStream; +import org.schabi.newpipe.extractor.MediaFormat; +import org.schabi.newpipe.util.subtitle.SubtitleDeduplicator; +import org.schabi.newpipe.util.subtitle.SubtitleOrigin; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public final class AppStreamInfo { + private static final String TAG = AppStreamInfo.class.getSimpleName(); + + @NonNull + private final StreamInfo originalStreamInfo; + @Nullable + private List normalizedSubtitles = null; + + private AppStreamInfo(@NonNull final StreamInfo original) { + this.originalStreamInfo = original; + } + + // Factory method: build AppStreamInfo from raw StreamInfo. 
+ @NonNull + public static AppStreamInfo from(@NonNull final StreamInfo info) { + return new AppStreamInfo(info); + } + + @NonNull + public StreamInfo getOriginal() { + return originalStreamInfo; + } + + @NonNull + public List loadNormalizedSubtitles() { + if (null == normalizedSubtitles) { + final List originalSubtitles = + originalStreamInfo.getSubtitles(); + + normalizedSubtitles = + deduplicateSubtitles(originalSubtitles); + } + + return normalizedSubtitles; + } + + @NonNull + private static List deduplicateSubtitles( + @Nullable final List originalSubtitles) { + + if ((null == originalSubtitles) || originalSubtitles.isEmpty()) { + return Collections.emptyList(); + } + + final List newSubtitles = new ArrayList<>(); + + for (final SubtitlesStream oldSubtitle : originalSubtitles) { + final MediaFormat format = oldSubtitle.getFormat(); + if (null == format) { + newSubtitles.add(oldSubtitle); + continue; + } + + try { + final SubtitleOrigin origin = + SubtitleDeduplicator.getSubtitleOrigin( + oldSubtitle.isAutoGenerated(), + false + ); + + final String remoteSubtitleUrl = oldSubtitle.getContent(); + + final String subtitleUrl = + SubtitleDeduplicator.checkAndDeduplicate( + remoteSubtitleUrl, + format, + origin + ); + + if (remoteSubtitleUrl.equals(subtitleUrl)) { + newSubtitles.add(oldSubtitle); + } else { + final SubtitlesStream oneNewSubtitle = + new SubtitlesStream.Builder() + .setContent(subtitleUrl, true) + .setMediaFormat(format) + .setLanguageCode(oldSubtitle.getLanguageTag()) + .setAutoGenerated(oldSubtitle.isAutoGenerated()) + .build(); + + newSubtitles.add(oneNewSubtitle); + } + } catch (final Exception e) { + Log.w(TAG, "Subtitle deduplication failed", e); + newSubtitles.add(oldSubtitle); + } + } + + return newSubtitles; + } + +} From 52e2808e2dee17b975d5fbe9ab4d1cfa3b581d8c Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:13:37 +0800 Subject: [PATCH 09/13] [duplicated subtitle] Fix the 
issue of duplicated subtitles on the screen display (with ExoPlayer module). - Replace StreamInfo.getSubtitles() with AppStreamInfo.loadNormalizedSubtitles() to download TTML subtitles and deduplicate them. - Note: each module calling subtitle normalization will perform network download independently. AppStreamInfo cannot be shared across modules like StreamInfo. --- .../schabi/newpipe/player/resolver/VideoPlaybackResolver.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/app/src/main/java/org/schabi/newpipe/player/resolver/VideoPlaybackResolver.java b/app/src/main/java/org/schabi/newpipe/player/resolver/VideoPlaybackResolver.java index 670c13934df..897b1605c11 100644 --- a/app/src/main/java/org/schabi/newpipe/player/resolver/VideoPlaybackResolver.java +++ b/app/src/main/java/org/schabi/newpipe/player/resolver/VideoPlaybackResolver.java @@ -21,6 +21,7 @@ import org.schabi.newpipe.player.helper.PlayerHelper; import org.schabi.newpipe.player.mediaitem.MediaItemTag; import org.schabi.newpipe.player.mediaitem.StreamInfoTag; +import org.schabi.newpipe.streams.AppStreamInfo; import org.schabi.newpipe.util.ListHelper; import java.util.ArrayList; @@ -136,7 +137,8 @@ public MediaSource resolve(@NonNull final StreamInfo info) { // Below are auxiliary media sources // Create subtitle sources - final List subtitlesStreams = info.getSubtitles(); + final AppStreamInfo appInfo = AppStreamInfo.from(info); + final List subtitlesStreams = appInfo.loadNormalizedSubtitles(); if (subtitlesStreams != null) { // Torrent and non URL subtitles are not supported by ExoPlayer final List nonTorrentAndUrlStreams = getUrlAndNonTorrentStreams( From 6818991df8cb28419503365a12978e7b011d5798 Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:14:07 +0800 Subject: [PATCH 10/13] [duplicated subtitle] Support subtitle deduplication for manual SRT downloads - After remote subtitles (TTML format) are downloaded, the 
subtitle content is processed by SubtitleDeduplicator to remove duplicated segments. The cleaned content is then passed to SrtFromTtmlWriter to generate the final SRT subtitle file without duplicated entries. - This logic is platform-independent and does not distinguish whether the video source is YouTube or another platform. - A minimal ByteArraySharpStream implementation is used to adapt the deduplicated byte content back into the SharpStream interface without modifying existing stream APIs. - Add comment explaining why `> 0` is used when reading SharpStream. --- .../giga/postprocessing/TtmlConverter.java | 118 +++++++++++++++++- 1 file changed, 115 insertions(+), 3 deletions(-) diff --git a/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java b/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java index d723bfb4561..05f81d08d72 100644 --- a/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java +++ b/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java @@ -4,8 +4,12 @@ import org.schabi.newpipe.streams.SrtFromTtmlWriter; import org.schabi.newpipe.streams.io.SharpStream; +import org.schabi.newpipe.util.subtitle.SubtitleDeduplicator; +import java.nio.charset.StandardCharsets; import java.io.IOException; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; /** * @author kapodamy @@ -23,12 +27,17 @@ int process(SharpStream out, SharpStream... 
sources) throws IOException { // check if the subtitle is already in srt and copy, this should never happen String format = getArgumentAt(0, null); boolean ignoreEmptyFrames = getArgumentAt(1, "true").equals("true"); - if (format == null || format.equals("ttml")) { SrtFromTtmlWriter writer = new SrtFromTtmlWriter(out, ignoreEmptyFrames); - try { - writer.build(sources[0]); + final String subtitleContent = + readSharpStreamToString(sources[0]); + final String deduplicated = + SubtitleDeduplicator.deduplicateContent(subtitleContent); + final SharpStream stream = + new ByteArraySharpStream( + deduplicated.getBytes(StandardCharsets.UTF_8)); + writer.build(stream); } catch (IOException err) { Log.e(TAG, "subtitle conversion failed due to I/O error", err); throw err; @@ -50,4 +59,107 @@ int process(SharpStream out, SharpStream... sources) throws IOException { throw new UnsupportedOperationException("Can't convert this subtitle, unimplemented format: " + format); } + private static String readSharpStreamToString(final SharpStream stream) throws IOException { + + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + final byte[] buffer = new byte[8192]; + + int read; + + // Note: `> 0` is required here because ChunkFileInputStream.read() + // returns 0 at EOF instead of -1. Using `!= -1` would result in + // an infinite loop in that case. + // + // Standard Java InputStream.read() returns -1 at EOF. + // + // Reference implementation: + // - ChunkFileInputStream.java + // + // Future note: + // - If ChunkFileInputStream changes to return -1 at EOF, this loop + // can safely be switched back to `read != -1`. Keeping `> 0` is + // also safe and will continue to work. + while ((read = stream.read(buffer)) > 0) { + out.write(buffer, 0, read); + } + + final String result = out.toString(StandardCharsets.UTF_8); + + return result; + } + + /** + * Minimal SharpStream backed by a byte array. 
+ */ + private static final class ByteArraySharpStream extends SharpStream { + private final ByteArrayInputStream in; + + ByteArraySharpStream(byte[] data) { + this.in = new ByteArrayInputStream(data); + } + + @Override + public int read() { + return in.read(); + } + + @Override + public int read(byte[] buffer) { + return in.read(buffer, 0, buffer.length); + } + + @Override + public int read(byte[] buffer, int offset, int count) { + return in.read(buffer, offset, count); + } + + @Override + public long skip(long amount) { + return in.skip(amount); + } + + @Override + public long available() { + return in.available(); + } + + @Override + public void rewind() { + in.reset(); + } + + @Override + public boolean isClosed() { + return false; + } + + @Override + public void close() {} + + @Override + public boolean canRewind() { return true; } + + @Override + public boolean canRead() { return true; } + + @Override + public boolean canWrite() { return false; } + + @Override + public void write(byte value) throws IOException { + // This stream is read-only + // and used only for reading subtitle data. 
+ throw new IOException("Stream is read-only"); + } + + @Override + public void write(byte[] buffer) throws IOException { + throw new IOException("Stream is read-only"); + } + + @Override + public void write(byte[] buffer, int offset, int count) throws IOException { + throw new IOException("Stream is read-only"); + } + } } From 8d119f519450cd15e3c3457fd7428062489308e9 Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:14:37 +0800 Subject: [PATCH 11/13] [duplicated subtitle] Add methods in SubtitleDeduplicator to ignore style tags and normalize subtitle text content - Helps handle YouTube subtitles that have different style attributes but the same text and timestamps - Add SUPPORT_STYLED_SUBTITLE_RENDERING flag for future styled subtitle support (currently not supported in NewPipe) - Remove invisible Unicode characters (zero-width and directionality controls) - Handle non-breaking spaces, BOM (U+FEFF), multiple spaces, and leading/trailing spaces - This commit is tested with: https://www.youtube.com/watch?v=7w3jBGX7UcY --- .../util/subtitle/SubtitleDeduplicator.java | 93 ++++++++++++++++--- 1 file changed, 82 insertions(+), 11 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java index a0544411a3c..fd644dc6fab 100644 --- a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java @@ -53,6 +53,12 @@ public final class SubtitleDeduplicator { private static final float BACKOFF_FACTOR = 1.0f; + // Once NewPipe/ExoPlayer supports styled subtitle rendering + // (e.g., colors, bold, background), set this to 'true' + // to preserve different styles for the same subtitle text + // in consecutive subtitle entries. 
+ private static final boolean SUPPORT_STYLED_SUBTITLE_RENDERING = false; + private static String subCacheDir = "subtitle_cache"; private static File cacheDir = null; @@ -250,6 +256,7 @@ public static boolean containsDuplicateTtmlEntries(final File subtitleFile) { // Detects whether the subtitle contains duplicated

entries // using the same normalized (whitespace-trimmed) comparison rules // as deduplicateContent(). + // Note: entry == paragraph public static boolean containsDuplicatedEntries(final String subtitleContent) { if (stringIsNullOrEmpty(subtitleContent)) { return false; @@ -296,7 +303,7 @@ public static String deduplicateTtmlFile(final File subtitleFile) { } public static String deduplicateContent(final String subtitleContent) { - // Subtitle entries are considered duplicated only if: + // Subtitle entries/paragraphs are considered duplicated only if: // 1) begin timestamp is exactly the same, // 2) end timestamp is exactly the same, // 3) subtitle text content is the same @@ -360,25 +367,89 @@ private static String getSubtitleKeyOfTtml(final Matcher matcher) { final String begin = matcher.group(1).trim(); final String end = matcher.group(2).trim(); + // Textual content units inside the

element. + // It may contain tags/attributes. + final String rawTextualContent = matcher.group(3); + + String textContent = null; // Normalize subtitle text before comparison: - // - Leading and trailing whitespace is ignored - // - Runs of whitespace are collapsed into a single space (' ') // // Note: // This operates on raw TTML text as received (before XML entity decoding). // XML-encoded whitespace (e.g. ) is not decoded at this stage. // - // This is intentional: visually identical subtitles may differ only - // in whitespace due to formatting or extraction differences, and - // should be considered duplicates in such cases. - final String content = matcher.group(3) - .trim() - .replaceAll("\\s+", " "); - - final String key = begin + "|" + end + "|" + content; + if (!SUPPORT_STYLED_SUBTITLE_RENDERING) { + // Purpose: + // Some subtitles have the same text but different style + // attributes (e.g., colors, bold). + // If NewPipe does not support styled subtitle rendering, + // style attributes are meaningless, so they are ignored + // during deduplication. + // + // Example: + //

+ // Magic + //

+ //

+ // Magic + //

+ // These two subtitles have the same visible text but + // different style attributes. They will be considered + // duplicates after stripping style tags. + // + // Note: + // It may still contain
tags, which we intentionally + // keep for semantic meaning. + final String textWithoutStyles = stripStyleTags(rawTextualContent); + textContent = normalizeParagraphText(textWithoutStyles); + } else { + textContent = normalizeParagraphText(rawTextualContent); + } + + final String key = begin + "|" + end + "|" + textContent; return key; } + private static String stripStyleTags(final String textualContent) { + return textualContent + .replaceAll("]*>", "") + .replaceAll("", ""); + } + + private static String normalizeParagraphText(final String textContent) { + if (textContent == null) { + return ""; + } + + final String normalized = textContent + // Remove invisible Unicode characters + // Reason: + // Two subtitle entries may look the same visually, but + // they may differ in code due to invisible characters. + // Removing them ensures proper detection of duplicated + // subtitles. + // Covered characters: + // - Zero-width spaces and related characters (U+200B to U+200D) + // - Directionality control characters (U+200E, U+200F) + // - Directionality formatting characters (U+202A to U+202E) + // - Byte Order Mark (BOM, U+FEFF) + .replaceAll("[\\u200B-\\u200F\\u202A-\\u202E\\uFEFF]", "") + + // normalize non-breaking space to normal space + .replace('\u00A0', ' ') + + // Runs of whitespace are collapsed into a single space (' ') + // This is intentional: visually identical subtitles + // may differ only in whitespace due to formatting or + // extraction, and should still be considered duplicates. 
+ .replaceAll("\\s+", " ") + + // Leading and trailing whitespace is ignored + .trim(); + + return normalized; + } + private static String buildLocalFileUri(final File subtitleCacheFile) { final String path = LOCAL_SUBTITLE_URL_PREFIX + subtitleCacheFile.getAbsolutePath(); From cc6e119688cc04ace734e9407d5f2eea8b196da6 Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:15:07 +0800 Subject: [PATCH 12/13] [duplicated subtitle] Refactor SubtitleDeduplicator for readability, without changing logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename methods: - getSubtitleKeyOfTtml() → buildDeduplicationKey() - storeItToCacheDir() → writeContentToCacheFile() - Rename variables: - subtitleContent → ttmlFileContent - seen → processedKeys - subCacheDir → SUBTITLE_DEDUP_CACHE_DIR - Improve deduplicateContent() by clarifying variable names and adding comments - Add comments to explain the logic (No functional changes) --- .../util/subtitle/SubtitleDeduplicator.java | 108 +++++++++++------- 1 file changed, 65 insertions(+), 43 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java index fd644dc6fab..549f20c8739 100644 --- a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java @@ -59,7 +59,7 @@ public final class SubtitleDeduplicator { // in consecutive subtitle entries. 
private static final boolean SUPPORT_STYLED_SUBTITLE_RENDERING = false; - private static String subCacheDir = "subtitle_cache"; + private static final String SUBTITLE_DEDUP_CACHE_DIR = "subtitle_cache"; private static File cacheDir = null; @@ -67,13 +67,14 @@ private SubtitleDeduplicator() { // no instance } - // cacheDir is /storage/emulated/0/Android/data//cache/{subCacheDir} + // cacheDir is /storage/emulated/0/Android/data// + // cache/{SUBTITLE_DEDUP_CACHE_DIR} public static void setCacheDirPath(final String path) { if (stringIsNullOrEmpty(path)) { return; } - cacheDir = new File(path, subCacheDir); + cacheDir = new File(path, SUBTITLE_DEDUP_CACHE_DIR); createDirIfNotExist(cacheDir); } @@ -136,7 +137,7 @@ public static String checkAndDeduplicate(final String remoteSubtitleUrl, currentSubtitleOrigin, currentSubtitleState); - final String localSubtitleUri = storeItToCacheDir(finalContent, + final String localSubtitleUri = writeContentToCacheFile(finalContent, format, currentSubtitleOrigin, currentCacheFile); @@ -257,21 +258,21 @@ public static boolean containsDuplicateTtmlEntries(final File subtitleFile) { // using the same normalized (whitespace-trimmed) comparison rules // as deduplicateContent(). 
// Note: entry == paragraph - public static boolean containsDuplicatedEntries(final String subtitleContent) { - if (stringIsNullOrEmpty(subtitleContent)) { + public static boolean containsDuplicatedEntries(final String ttmlFileContent) { + if (stringIsNullOrEmpty(ttmlFileContent)) { return false; } - final Matcher matcher = getTtmlMatcher(subtitleContent); + final Matcher matcher = getTtmlMatcher(ttmlFileContent); - final Set seen = new HashSet<>(); + final Set processedKeys = new HashSet<>(); while (matcher.find()) { - final String key = getSubtitleKeyOfTtml(matcher); + final String currentParagraphKey = buildDeduplicationKey(matcher); - if (seen.contains(key)) { + if (processedKeys.contains(currentParagraphKey)) { return true; } - seen.add(key); + processedKeys.add(currentParagraphKey); } return false; @@ -302,7 +303,7 @@ public static String deduplicateTtmlFile(final File subtitleFile) { } } - public static String deduplicateContent(final String subtitleContent) { + public static String deduplicateContent(final String ttmlFileContent) { // Subtitle entries/paragraphs are considered duplicated only if: // 1) begin timestamp is exactly the same, // 2) end timestamp is exactly the same, @@ -312,30 +313,50 @@ public static String deduplicateContent(final String subtitleContent) { // This is a normalized comparison (trimmed and whitespace-normalized). // No semantic analysis or fuzzy matching is performed. - if (stringIsNullOrEmpty(subtitleContent)) { - return subtitleContent; + if (stringIsNullOrEmpty(ttmlFileContent)) { + return ttmlFileContent; } - final Matcher matcher = getTtmlMatcher(subtitleContent); - - final Set seen = new HashSet<>(); + final Set processedKeys = new HashSet<>(); final StringBuilder result = new StringBuilder(); - int lastIndex = 0; - while (matcher.find()) { - result.append(subtitleContent, lastIndex, matcher.start()); + // Create a matcher for all

...

entries + final Matcher matcher = getTtmlMatcher(ttmlFileContent); - final String key = getSubtitleKeyOfTtml(matcher); + // Keep track of the end index of the last processed

+ int lastParagraphEndIndex = 0; - if (!seen.contains(key)) { - result.append(matcher.group(0)); - seen.add(key); + while (matcher.find()) { + // Extract the gap between the previous

and the current

+ // - it may contain whitespace, newlines, or other XML elements. + // - it is NOT part of the subtitle paragraph. + // - It is never used for deduplication or screen display. + final String gapBetweenParagraphs = ttmlFileContent.substring( + lastParagraphEndIndex, + matcher.start() + ); + result.append(gapBetweenParagraphs); + + final String currentParagraph = matcher.group(0); + final String currentParagraphKey = buildDeduplicationKey(matcher); + + if (!processedKeys.contains(currentParagraphKey)) { + // Append the ORIGINAL full

paragraph. + // - This preserves the author's original formatting + // (runs of whitespace,
, etc.). + result.append(currentParagraph); + processedKeys.add(currentParagraphKey); } - lastIndex = matcher.end(); + // Move the last processed index to the end of the current

+ lastParagraphEndIndex = matcher.end(); } - result.append(subtitleContent.substring(lastIndex)); + // Append any remaining content after the last

. + // - Usually contains closing tags like , , . + final String trailingContent = ttmlFileContent.substring(lastParagraphEndIndex); + result.append(trailingContent); + return result.toString(); } @@ -363,7 +384,18 @@ private static Matcher getTtmlMatcher(final String subtitleContent) { return pattern.matcher(subtitleContent); } - private static String getSubtitleKeyOfTtml(final Matcher matcher) { + /** + * Generates a deduplication key for one TTML {@code

} paragraph. + * + * @param matcher Matcher already positioned on a single {@code

} element. + * group(1) = begin time + * group(2) = end time + * group(3) = raw textual content (may contain 'span' tags) + * @return a deduplication key composed of begin/end timestamps + * and normalized text, used to detect whether this subtitle entry + * has already been processed. + */ + private static String buildDeduplicationKey(final Matcher matcher) { final String begin = matcher.group(1).trim(); final String end = matcher.group(2).trim(); @@ -456,22 +488,20 @@ private static String buildLocalFileUri(final File subtitleCacheFile) { return path; } - private static String storeItToCacheDir(final String subtitleContent, + private static String writeContentToCacheFile(final String subtitleContent, final MediaFormat format, final SubtitleOrigin currentSubtitleOrigin, final File currentCacheFile) { - final File cacheFile = currentCacheFile; + final String cacheFilePathForExoplayer = buildLocalFileUri(currentCacheFile); - final String cacheFilePathForExoplayer = buildLocalFileUri(cacheFile); - - if (!ensureItsParentDirExist(cacheFile)) { + if (!ensureItsParentDirExist(currentCacheFile)) { return null; } - if (null == writeDeduplicatedContentToCachefile(subtitleContent, cacheFile)) { + if (null == writeContentToFile(subtitleContent, currentCacheFile)) { return cacheFilePathForExoplayer; } else { - Log.e(TAG, "Failed to write cache file: " + cacheFile.getAbsolutePath()); + Log.e(TAG, "Failed to write cache file: " + currentCacheFile.getAbsolutePath()); return null; } } @@ -526,9 +556,7 @@ private static String buildSubtitleCacheFilename( } private static String getLanguageCode(final String remoteSubtitleUrl) { - String languageCode = null; - languageCode = YoutubeParsingHelper.extractLanguageCode(remoteSubtitleUrl); - return languageCode; + return YoutubeParsingHelper.extractLanguageCode(remoteSubtitleUrl); } private static String getAutoTranslateLanguage(final String remoteSubtitleUrl) { @@ -653,12 +681,6 @@ private static boolean ensureItsParentDirExist(final 
File tempCacheFile) { } } - private static String writeDeduplicatedContentToCachefile( - final String subtitleContent, - final File tempCacheFile) { - return writeContentToFile(subtitleContent, tempCacheFile); - } - private static String writeContentToFile(final String content, final File tempFile) { try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( From a87a8da7c62f1a9c7ee00d91a0df3c059bdefa94 Mon Sep 17 00:00:00 2001 From: TransZAllen <49811617+TransZAllen@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:15:37 +0800 Subject: [PATCH 13/13] [duplicated subtitle] perf(SubtitleDeduplicator): compile TTML

pattern once for efficiency - Replace defineTtmlSubtitlePattern() method with a static final Pattern - getTtmlMatcher() now reuses the precompiled pattern instead of recompiling each call - Improves performance when processing multiple subtitles without changing behavior --- .../newpipe/util/subtitle/SubtitleDeduplicator.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java index 549f20c8739..94467e7cb98 100644 --- a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java @@ -372,15 +372,13 @@ private static boolean stringIsNullOrEmpty(final String inputString) { return false; } - private static Pattern defineTtmlSubtitlePattern() { - return Pattern.compile( - "]*begin=\"([^\"]+)\"[^>]*end=\"([^\"]+)\"[^>]*>(.*?)

", - Pattern.DOTALL - ); - } + private static final Pattern TTML_PARAGRAPH_PATTERN = Pattern.compile( + "]*begin=\"([^\"]+)\"[^>]*end=\"([^\"]+)\"[^>]*>(.*?)

", + Pattern.DOTALL + ); private static Matcher getTtmlMatcher(final String subtitleContent) { - final Pattern pattern = defineTtmlSubtitlePattern(); + final Pattern pattern = TTML_PARAGRAPH_PATTERN; return pattern.matcher(subtitleContent); }