diff --git a/app/src/main/java/org/schabi/newpipe/App.kt b/app/src/main/java/org/schabi/newpipe/App.kt index 3ca259528ac..dd1561f36c8 100644 --- a/app/src/main/java/org/schabi/newpipe/App.kt +++ b/app/src/main/java/org/schabi/newpipe/App.kt @@ -34,12 +34,14 @@ import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeStreamExt import org.schabi.newpipe.ktx.hasAssignableCause import org.schabi.newpipe.settings.NewPipeSettings import org.schabi.newpipe.util.BridgeStateSaverInitializer +import org.schabi.newpipe.util.CacheDirUtils import org.schabi.newpipe.util.Localization import org.schabi.newpipe.util.ServiceHelper import org.schabi.newpipe.util.StateSaver import org.schabi.newpipe.util.image.ImageStrategy import org.schabi.newpipe.util.image.PreferredImageQuality import org.schabi.newpipe.util.potoken.PoTokenProviderImpl +import org.schabi.newpipe.util.subtitle.SubtitleDeduplicator /* * Copyright (C) Hans-Christoph Steiner 2016 @@ -93,6 +95,8 @@ open class App : .getInt(getString(R.string.last_used_preferences_version), -1) isFirstRun = lastUsedPrefVersion == -1 + val appCacheDirPath = CacheDirUtils.getPreferredAppCacheDirPath(this) + // Initialize settings first because other initializations can use its values NewPipeSettings.initSettings(this) @@ -124,6 +128,8 @@ open class App : configureRxJavaErrorHandler() YoutubeStreamExtractor.setPoTokenProvider(PoTokenProviderImpl) + + SubtitleDeduplicator.setCacheDirPath(appCacheDirPath) } override fun newImageLoader(context: Context): ImageLoader = ImageLoader diff --git a/app/src/main/java/org/schabi/newpipe/player/resolver/VideoPlaybackResolver.java b/app/src/main/java/org/schabi/newpipe/player/resolver/VideoPlaybackResolver.java index 670c13934df..897b1605c11 100644 --- a/app/src/main/java/org/schabi/newpipe/player/resolver/VideoPlaybackResolver.java +++ b/app/src/main/java/org/schabi/newpipe/player/resolver/VideoPlaybackResolver.java @@ -21,6 +21,7 @@ import org.schabi.newpipe.player.helper.PlayerHelper; 
import org.schabi.newpipe.player.mediaitem.MediaItemTag; import org.schabi.newpipe.player.mediaitem.StreamInfoTag; +import org.schabi.newpipe.streams.AppStreamInfo; import org.schabi.newpipe.util.ListHelper; import java.util.ArrayList; @@ -136,7 +137,8 @@ public MediaSource resolve(@NonNull final StreamInfo info) { // Below are auxiliary media sources // Create subtitle sources - final List subtitlesStreams = info.getSubtitles(); + final AppStreamInfo appInfo = AppStreamInfo.from(info); + final List subtitlesStreams = appInfo.loadNormalizedSubtitles(); if (subtitlesStreams != null) { // Torrent and non URL subtitles are not supported by ExoPlayer final List nonTorrentAndUrlStreams = getUrlAndNonTorrentStreams( diff --git a/app/src/main/java/org/schabi/newpipe/streams/AppStreamInfo.java b/app/src/main/java/org/schabi/newpipe/streams/AppStreamInfo.java new file mode 100644 index 00000000000..9cf1df7a18d --- /dev/null +++ b/app/src/main/java/org/schabi/newpipe/streams/AppStreamInfo.java @@ -0,0 +1,109 @@ +package org.schabi.newpipe.streams; + +import androidx.annotation.NonNull; +import androidx.annotation.Nullable; + +import android.util.Log; + +import org.schabi.newpipe.extractor.stream.StreamInfo; +import org.schabi.newpipe.extractor.stream.SubtitlesStream; +import org.schabi.newpipe.extractor.MediaFormat; +import org.schabi.newpipe.util.subtitle.SubtitleDeduplicator; +import org.schabi.newpipe.util.subtitle.SubtitleOrigin; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public final class AppStreamInfo { + private static final String TAG = AppStreamInfo.class.getSimpleName(); + + @NonNull + private final StreamInfo originalStreamInfo; + @Nullable + private List normalizedSubtitles = null; + + private AppStreamInfo(@NonNull final StreamInfo original) { + this.originalStreamInfo = original; + } + + // Factory method: build AppStreamInfo from raw StreamInfo. 
+ @NonNull + public static AppStreamInfo from(@NonNull final StreamInfo info) { + return new AppStreamInfo(info); + } + + @NonNull + public StreamInfo getOriginal() { + return originalStreamInfo; + } + + @NonNull + public List loadNormalizedSubtitles() { + if (null == normalizedSubtitles) { + final List originalSubtitles = + originalStreamInfo.getSubtitles(); + + normalizedSubtitles = + deduplicateSubtitles(originalSubtitles); + } + + return normalizedSubtitles; + } + + @NonNull + private static List deduplicateSubtitles( + @Nullable final List originalSubtitles) { + + if ((null == originalSubtitles) || originalSubtitles.isEmpty()) { + return Collections.emptyList(); + } + + final List newSubtitles = new ArrayList<>(); + + for (final SubtitlesStream oldSubtitle : originalSubtitles) { + final MediaFormat format = oldSubtitle.getFormat(); + if (null == format) { + newSubtitles.add(oldSubtitle); + continue; + } + + try { + final SubtitleOrigin origin = + SubtitleDeduplicator.getSubtitleOrigin( + oldSubtitle.isAutoGenerated(), + false + ); + + final String remoteSubtitleUrl = oldSubtitle.getContent(); + + final String subtitleUrl = + SubtitleDeduplicator.checkAndDeduplicate( + remoteSubtitleUrl, + format, + origin + ); + + if (remoteSubtitleUrl.equals(subtitleUrl)) { + newSubtitles.add(oldSubtitle); + } else { + final SubtitlesStream oneNewSubtitle = + new SubtitlesStream.Builder() + .setContent(subtitleUrl, true) + .setMediaFormat(format) + .setLanguageCode(oldSubtitle.getLanguageTag()) + .setAutoGenerated(oldSubtitle.isAutoGenerated()) + .build(); + + newSubtitles.add(oneNewSubtitle); + } + } catch (final Exception e) { + Log.w(TAG, "Subtitle deduplication failed", e); + newSubtitles.add(oldSubtitle); + } + } + + return newSubtitles; + } + +} diff --git a/app/src/main/java/org/schabi/newpipe/util/CacheDirUtils.java b/app/src/main/java/org/schabi/newpipe/util/CacheDirUtils.java new file mode 100644 index 00000000000..dc6a0978422 --- /dev/null +++ 
b/app/src/main/java/org/schabi/newpipe/util/CacheDirUtils.java @@ -0,0 +1,66 @@ +package org.schabi.newpipe.util; + +import java.io.File; + +import android.content.Context; +import androidx.annotation.NonNull; + +public final class CacheDirUtils { + + private CacheDirUtils() { + // no instance + } + + public static String getExternalAppCacheDirPath( + @NonNull final Context context) { + final File externalCacheDir = context.getExternalCacheDir(); + if (null != externalCacheDir) { + // /storage/emulated/0/Android/data//cache/ + return externalCacheDir.getAbsolutePath(); + } + + return null; + } + + public static String getInternalAppCacheDirPath( + @NonNull final Context context) { + // always available, never be 'null' + // /data/user/0//cache/ + return context.getCacheDir().getAbsolutePath(); + } + + /** + * Returns the preferred cache directory path for the application. + * + * Prefers the external cache directory when available + * (user-accessible, larger space), + * falls back to the internal private cache directory otherwise + * (always available, more secure). + * + * Typical paths: + * - External: /storage/emulated/0/Android/data//cache/ + * - Internal: /data/user/0//cache/ + * (or /data/data//cache/ on some devices) + * + * Note: The 'external' and 'internal' cache directories mentioned above + * are Android terms. They are typically located on the device's + * built-in storage and are not related to removable SD/TF cards. + * + * User "Clear Cache" in app settings deletes files in both locations. 
+ * + * @param context used to get the available cache dir + * @return absolute path string, never null + */ + @NonNull + public static String getPreferredAppCacheDirPath( + @NonNull final Context context) { + + final String externalCacheDirPath = getExternalAppCacheDirPath(context); + if (null != externalCacheDirPath) { + return externalCacheDirPath; + } + + // Internal cache dir should always be available + return getInternalAppCacheDirPath(context); + } +} diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java new file mode 100644 index 00000000000..94467e7cb98 --- /dev/null +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java @@ -0,0 +1,695 @@ +package org.schabi.newpipe.util.subtitle; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.BufferedWriter; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import java.net.MalformedURLException; +import java.net.URL; + +import androidx.annotation.NonNull; + +import android.util.Log; + +import org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper; +import org.schabi.newpipe.extractor.MediaFormat; +import org.schabi.newpipe.extractor.NewPipe; +import org.schabi.newpipe.extractor.downloader.Downloader; +import org.schabi.newpipe.extractor.downloader.Response; +import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; + +/** + * SubtitleDeduplicator.java + * + * 1. This file is responsible for checking if the subtitles + * contain any duplicate entries. 
+ * a) If duplicates are found, it performs the following steps: + * downloads the subtitle (TTML format), deduplicates it, + * and stores it locally. + * b) If no duplicates are found, downloads and stores it. + * + * 2. Core Functions: + * - checkAndDeduplicate(): Checks for duplicate subtitles + * and handles downloading, deduplication, and local storage. + * + */ + +public final class SubtitleDeduplicator { + private static final String TAG = "SubtitleDeduplicator"; + public static final String LOCAL_SUBTITLE_URL_PREFIX = "file://"; + + private static final float BACKOFF_FACTOR = 1.0f; + + // Once NewPipe/ExoPlayer supports styled subtitle rendering + // (e.g., colors, bold, background), set this to 'true' + // to preserve different styles for the same subtitle text + // in consecutive subtitle entries. + private static final boolean SUPPORT_STYLED_SUBTITLE_RENDERING = false; + + private static final String SUBTITLE_DEDUP_CACHE_DIR = "subtitle_cache"; + + private static File cacheDir = null; + + private SubtitleDeduplicator() { + // no instance + } + + // cacheDir is /storage/emulated/0/Android/data// + // cache/{SUBTITLE_DEDUP_CACHE_DIR} + public static void setCacheDirPath(final String path) { + if (stringIsNullOrEmpty(path)) { + return; + } + + cacheDir = new File(path, SUBTITLE_DEDUP_CACHE_DIR); + + createDirIfNotExist(cacheDir); + } + + // Returns either a remote subtitle URL or a local file URI (file://) + // @param remoteSubtitleUrl: A valid YouTube subtitle URL, expected to + // contain videoId and languageCode parameters. + public static String checkAndDeduplicate(final String remoteSubtitleUrl, + final MediaFormat format, + final SubtitleOrigin currentSubtitleOrigin) { + // Subtitle deduplication relies on YouTube-specific subtitle URL semantics + // (videoId, languageCode, translationCode) which are used for cache file naming. + // For non-YouTube URLs, the original subtitle is returned unchanged. 
+ if (!isYoutubeRelatedUrl(remoteSubtitleUrl)) { + return remoteSubtitleUrl; + } + + if (!isCacheDirAvailable()) { + printCacheDirNotInitialized(); + return remoteSubtitleUrl; + } + // *** Step 1: Download remote subtitle content + + // - The remote subtitle is ALWAYS downloaded to ensure + // the newest version is used. + // - Although cached subtitles are available, they may be + // outdated since the video creator or the YouTube + // platform can update them. + + // Current subtitle format is TTML + final String downloadedContent = downloadRemoteSubtitleContent( + remoteSubtitleUrl, + currentSubtitleOrigin, + 3, + 1000); + + if (subtitleDownloadFails(downloadedContent)) { + return fallbackToStoredOrRemote(remoteSubtitleUrl, + format, + currentSubtitleOrigin); + } + + String finalContent = null; + SubtitleState currentSubtitleState = SubtitleState.ORIGINAL; + + // *** Step 2: Detect and deduplicate if needed + + if (containsDuplicatedEntries(downloadedContent)) { + finalContent = deduplicateContent(downloadedContent); + currentSubtitleState = SubtitleState.DEDUPLICATED; + } else { + finalContent = downloadedContent; + currentSubtitleState = SubtitleState.ORIGINAL; + } + + // *** Step 3: Store subtitle to cache and return local URI if possible + + final File currentCacheFile = getCacheFile(remoteSubtitleUrl, + format, + currentSubtitleOrigin, + currentSubtitleState); + + final String localSubtitleUri = writeContentToCacheFile(finalContent, + format, + currentSubtitleOrigin, + currentCacheFile); + + if (subtitleStorageFails(localSubtitleUri)) { + return fallbackToStoredOrRemote(remoteSubtitleUrl, + format, + currentSubtitleOrigin); + } + + return localSubtitleUri; + } + + private static boolean isCacheDirAvailable() { + if (null == cacheDir) { + return false; + } + + return createDirIfNotExist(cacheDir); + } + + private static boolean createDirIfNotExist(final File directory) { + if (!directory.exists()) { + directory.mkdirs(); + } + + return ((directory.exists()) 
&& (directory.isDirectory())); + } + + private static void printCacheDirNotInitialized() { + final String errorMessage = + "SubtitleDeduplicator cache directory is not initialized. " + + "Fallback to original subtitle without deduplication. " + + "setCacheDirPath() should be called before using this class."; + + Log.w(TAG, errorMessage); + } + + private static String downloadRemoteSubtitleContent(final String urlStr, + final SubtitleOrigin currentOrigin, + final int maxRetries, + final int initialDelayMillis) { + final Downloader downloader = NewPipe.getDownloader(); + if (downloader == null) { + Log.w(TAG, "Downloader not initialized- cannot download subtitles"); + return null; + } + // if auto-translate language subtitle, use the bigger data. + int delay = resolveDelay(currentOrigin, initialDelayMillis); + for (int attempt = 1; attempt <= maxRetries; attempt++) { + try { + final Map> headers = new HashMap<>(); + headers.put("Accept", Collections.singletonList("text/*")); + headers.put("Accept-Language", Collections.singletonList("en-US,en;q=0.9")); + final Response response = downloader.get(urlStr, headers); + if (response.responseCode() == 200) { + return response.responseBody(); + } else { + Log.w(TAG, "Attempt " + attempt + " failed with status: " + + response.responseCode() + " URL: " + urlStr); + if (response.responseCode() != 503 && response.responseCode() != 429) { + return null; + } + } + } catch (IOException | ReCaptchaException e) { + Log.w(TAG, "Attempt " + attempt + " failed for URL: " + urlStr, e); + } + if (attempt < maxRetries) { + try { + Thread.sleep(delay); + delay = adjustDelayAfterRetry(delay); + } catch (final InterruptedException ie) { + Thread.currentThread().interrupt(); + return null; + } + } + } + Log.e(TAG, "Failed to download subtitle after " + maxRetries + + " attempts. 
URL: " + urlStr); + return null; + } + + private static boolean isAutoTranslateSubtitle(final SubtitleOrigin currentOrigin) { + return (currentOrigin == SubtitleOrigin.AUTO_TRANSLATED); + } + + private static int resolveDelay(final SubtitleOrigin currentOrigin, + final int baseDelayMillis) { + if (isAutoTranslateSubtitle(currentOrigin)) { + // Auto-translated subtitles are observed to be less reliable. + // A separate delay path is kept to allow future tuning without + // affecting the common subtitle download flow. + return (baseDelayMillis + 1); + } else { + return baseDelayMillis; + } + } + + private static int adjustDelayAfterRetry(final int currentDelayMillis) { + return (int) (currentDelayMillis * BACKOFF_FACTOR); + } + + public static boolean containsDuplicateTtmlEntries(final File subtitleFile) { + if (subtitleFile == null || !subtitleFile.exists()) { + return false; + } + + try { + final String content = readFileToString(subtitleFile); + return containsDuplicatedEntries(content); + } catch (final IOException e) { + e.printStackTrace(); + return false; + } + } + + // Detects whether the subtitle contains duplicated

entries + // using the same normalized (whitespace-trimmed) comparison rules + // as deduplicateContent(). + // Note: entry == paragraph + public static boolean containsDuplicatedEntries(final String ttmlFileContent) { + if (stringIsNullOrEmpty(ttmlFileContent)) { + return false; + } + + final Matcher matcher = getTtmlMatcher(ttmlFileContent); + + final Set processedKeys = new HashSet<>(); + while (matcher.find()) { + final String currentParagraphKey = buildDeduplicationKey(matcher); + + if (processedKeys.contains(currentParagraphKey)) { + return true; + } + processedKeys.add(currentParagraphKey); + } + + return false; + } + + private static String readFileToString(final File file) throws IOException { + final StringBuilder sb = new StringBuilder(); + try (BufferedReader reader = new BufferedReader(new FileReader(file))) { + String line; + while ((line = reader.readLine()) != null) { + sb.append(line).append("\n"); + } + } + return sb.toString(); + } + + public static String deduplicateTtmlFile(final File subtitleFile) { + if (subtitleFile == null || !subtitleFile.exists()) { + return ""; + } + + try { + final String content = readFileToString(subtitleFile); + return deduplicateContent(content); + } catch (final IOException e) { + e.printStackTrace(); + return ""; + } + } + + public static String deduplicateContent(final String ttmlFileContent) { + // Subtitle entries/paragraphs are considered duplicated only if: + // 1) begin timestamp is exactly the same, + // 2) end timestamp is exactly the same, + // 3) subtitle text content is the same + // after normalized (trimming and whitespace normalization). + // + // This is a normalized comparison (trimmed and whitespace-normalized). + // No semantic analysis or fuzzy matching is performed. + + if (stringIsNullOrEmpty(ttmlFileContent)) { + return ttmlFileContent; + } + + final Set processedKeys = new HashSet<>(); + final StringBuilder result = new StringBuilder(); + + // Create a matcher for all

<p>...</p>

entries + final Matcher matcher = getTtmlMatcher(ttmlFileContent); + + // Keep track of the end index of the last processed

+ int lastParagraphEndIndex = 0; + + while (matcher.find()) { + // Extract the gap between the previous

</p> and the current <p>

+ // - it may contain whitespace, newlines, or other XML elements. + // - it is NOT part of the subtitle paragraph. + // - It is never used for deduplication or screen display. + final String gapBetweenParagraphs = ttmlFileContent.substring( + lastParagraphEndIndex, + matcher.start() + ); + result.append(gapBetweenParagraphs); + + final String currentParagraph = matcher.group(0); + final String currentParagraphKey = buildDeduplicationKey(matcher); + + if (!processedKeys.contains(currentParagraphKey)) { + // Append the ORIGINAL full

<p> paragraph. + // - This preserves the author's original formatting + // (runs of whitespace, <br/>
, etc.). + result.append(currentParagraph); + processedKeys.add(currentParagraphKey); + } + + // Move the last processed index to the end of the current

+ lastParagraphEndIndex = matcher.end(); + } + + // Append any remaining content after the last

. + // - Usually contains closing tags like , , . + final String trailingContent = ttmlFileContent.substring(lastParagraphEndIndex); + result.append(trailingContent); + + return result.toString(); + } + + private static boolean stringIsNullOrEmpty(final String inputString) { + if (null == inputString) { + return true; + } + + if (inputString.isEmpty()) { + return true; + } + + return false; + } + + private static final Pattern TTML_PARAGRAPH_PATTERN = Pattern.compile( + "]*begin=\"([^\"]+)\"[^>]*end=\"([^\"]+)\"[^>]*>(.*?)

", + Pattern.DOTALL + ); + + private static Matcher getTtmlMatcher(final String subtitleContent) { + final Pattern pattern = TTML_PARAGRAPH_PATTERN; + return pattern.matcher(subtitleContent); + } + + /** + * Generates a deduplication key for one TTML {@code

<p>} paragraph. + * + * @param matcher Matcher already positioned on a single {@code <p>

} element. + * group(1) = begin time + * group(2) = end time + * group(3) = raw textual content (may contain 'span' tags) + * @return a deduplication key composed of begin/end timestamps + * and normalized text, used to detect whether this subtitle entry + * has already been processed. + */ + private static String buildDeduplicationKey(final Matcher matcher) { + final String begin = matcher.group(1).trim(); + final String end = matcher.group(2).trim(); + + // Textual content units inside the

element. + // It may contain tags/attributes. + final String rawTextualContent = matcher.group(3); + + String textContent = null; + // Normalize subtitle text before comparison: + // + // Note: + // This operates on raw TTML text as received (before XML entity decoding). + // XML-encoded whitespace (e.g. ) is not decoded at this stage. + // + if (!SUPPORT_STYLED_SUBTITLE_RENDERING) { + // Purpose: + // Some subtitles have the same text but different style + // attributes (e.g., colors, bold). + // If NewPipe does not support styled subtitle rendering, + // style attributes are meaningless, so they are ignored + // during deduplication. + // + // Example: + //

+ // Magic + //

+ //

+ // Magic + //

+ // These two subtitles have the same visible text but + // different style attributes. They will be considered + // duplicates after stripping style tags. + // + // Note: + // It may still contain <br/>
tags, which we intentionally + // keep for semantic meaning. + final String textWithoutStyles = stripStyleTags(rawTextualContent); + textContent = normalizeParagraphText(textWithoutStyles); + } else { + textContent = normalizeParagraphText(rawTextualContent); + } + + final String key = begin + "|" + end + "|" + textContent; + return key; + } + + private static String stripStyleTags(final String textualContent) { + return textualContent + .replaceAll("]*>", "") + .replaceAll("", ""); + } + + private static String normalizeParagraphText(final String textContent) { + if (textContent == null) { + return ""; + } + + final String normalized = textContent + // Remove invisible Unicode characters + // Reason: + // Two subtitle entries may look the same visually, but + // they may differ in code due to invisible characters. + // Removing them ensures proper detection of duplicated + // subtitles. + // Covered characters: + // - Zero-width spaces and related characters (U+200B to U+200D) + // - Directionality control characters (U+200E, U+200F) + // - Directionality formatting characters (U+202A to U+202E) + // - Byte Order Mark (BOM, U+FEFF) + .replaceAll("[\\u200B-\\u200F\\u202A-\\u202E\\uFEFF]", "") + + // normalize non-breaking space to normal space + .replace('\u00A0', ' ') + + // Runs of whitespace are collapsed into a single space (' ') + // This is intentional: visually identical subtitles + // may differ only in whitespace due to formatting or + // extraction, and should still be considered duplicates. 
+ .replaceAll("\\s+", " ") + + // Leading and trailing whitespace is ignored + .trim(); + + return normalized; + } + + private static String buildLocalFileUri(final File subtitleCacheFile) { + final String path = LOCAL_SUBTITLE_URL_PREFIX + subtitleCacheFile.getAbsolutePath(); + + return path; + } + + private static String writeContentToCacheFile(final String subtitleContent, + final MediaFormat format, + final SubtitleOrigin currentSubtitleOrigin, + final File currentCacheFile) { + final String cacheFilePathForExoplayer = buildLocalFileUri(currentCacheFile); + + if (!ensureItsParentDirExist(currentCacheFile)) { + return null; + } + + if (null == writeContentToFile(subtitleContent, currentCacheFile)) { + return cacheFilePathForExoplayer; + } else { + Log.e(TAG, "Failed to write cache file: " + currentCacheFile.getAbsolutePath()); + return null; + } + } + + // filename without dir path + private static String computeFilename(final String subtitleUrl, + final MediaFormat format, + final SubtitleOrigin currentSubtitleOrigin, + final SubtitleState currentSubtitleState) { + final String videoId = getVideoId(subtitleUrl); + + final String languageCode = resolveSubtitleLanguage( + subtitleUrl, + currentSubtitleOrigin + ); + + final String filename = buildSubtitleCacheFilename(videoId, + languageCode, + currentSubtitleOrigin, + currentSubtitleState, + format.getSuffix()); + + return filename; + } + + public static SubtitleOrigin getSubtitleOrigin(final boolean autoGenerated, + final boolean autoTranslate) { + if (autoTranslate) { + return SubtitleOrigin.AUTO_TRANSLATED; + } + if (autoGenerated) { + return SubtitleOrigin.AUTO_GENERATED; + } + return SubtitleOrigin.HUMAN_PROVIDED; + } + + @NonNull + private static String buildSubtitleCacheFilename( + @NonNull final String videoId, + @NonNull final String language, + @NonNull final SubtitleOrigin origin, + @NonNull final SubtitleState state, + @NonNull final String extension + ) { + final String filenamePartSeparator = "--"; 
+ + return videoId + + filenamePartSeparator + language + + filenamePartSeparator + origin.getId() + + filenamePartSeparator + state.getId() + + "." + extension; + } + + private static String getLanguageCode(final String remoteSubtitleUrl) { + return YoutubeParsingHelper.extractLanguageCode(remoteSubtitleUrl); + } + + private static String getAutoTranslateLanguage(final String remoteSubtitleUrl) { + // For auto-translate subtitles Url, there are two language code in it: + // one is 'lang', now its meaning is source language; + // the other is 'tlang', its meaning is target language. + String targetAutoTranslate = null; + targetAutoTranslate = YoutubeParsingHelper.extractTranslationCode( + remoteSubtitleUrl + ); + return targetAutoTranslate; + } + + // For auto-translate subtitles, the cache filename language + // represents the target language (tlang), not the source language. + private static String resolveSubtitleLanguage( + final String subtitleUrl, + final SubtitleOrigin origin + ) { + if (origin == SubtitleOrigin.AUTO_TRANSLATED) { + final String targetLang = getAutoTranslateLanguage(subtitleUrl); + + if (!stringIsNullOrEmpty(targetLang)) { + return targetLang; + } else { + final String unknownLanguage = "unknownLanguage"; + return unknownLanguage; + } + } + + return getLanguageCode(subtitleUrl); + } + + // Extract the videoId (e.g., "b7vmW_5HSpE") from a subtitle URL + // (e.g., .../api/timedtext?v=b7vmW_5HSpE) + // for use in generating unique filenames. 
+ private static String getVideoId(final String remoteSubtitleUrl) { + return YoutubeParsingHelper.extractVideoId(remoteSubtitleUrl); + } + + private static boolean isYoutubeRelatedUrl(@NonNull final String url) { + try { + final URL parsedUrl = new URL(url); + return (YoutubeParsingHelper.isYoutubeURL(parsedUrl) + || YoutubeParsingHelper.isYoutubeServiceURL(parsedUrl)); + } catch (final MalformedURLException e) { + return false; + } + } + + private static File getCacheFile(final String subtitleUrl, + final MediaFormat format, + final SubtitleOrigin currentSubtitleOrigin, + final SubtitleState currentSubtitleState) { + final String cachefilename = computeFilename(subtitleUrl, + format, + currentSubtitleOrigin, + currentSubtitleState); + + final File cacheFile = new File(cacheDir, cachefilename); + + return cacheFile; + } + + private static File findStoredCacheFile( + final String remoteSubtitleUrl, + final MediaFormat format, + final SubtitleOrigin currentSubtitleOrigin + ) { + for (final SubtitleState state : SubtitleState.values()) { + final File subtitleFile = getCacheFile( + remoteSubtitleUrl, + format, + currentSubtitleOrigin, + state + ); + + if (subtitleFile.exists() && subtitleFile.length() > 0) { + return subtitleFile; + } + } + + return null; + } + + @NonNull + private static String fallbackToStoredOrRemote( + @NonNull final String remoteSubtitleUrl, + @NonNull final MediaFormat format, + @NonNull final SubtitleOrigin origin + ) { + final File storedFile = findStoredCacheFile( + remoteSubtitleUrl, + format, + origin + ); + + if (storedFile != null) { + final String previousStoredUri = buildLocalFileUri(storedFile); + return previousStoredUri; + } + + return remoteSubtitleUrl; + } + + private static boolean subtitleDownloadFails(final String contentDownloaded) { + return (null == contentDownloaded); + } + + private static boolean subtitleStorageFails(final String localUriAfterStores) { + return (null == localUriAfterStores); + } + + private static boolean 
ensureItsParentDirExist(final File tempCacheFile) { + final File parentDir = tempCacheFile.getParentFile(); + + if (parentDir.exists()) { + return true; + } else { + final boolean result = parentDir.mkdirs(); + return result; + } + } + + private static String writeContentToFile(final String content, + final File tempFile) { + try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( + new FileOutputStream(tempFile), StandardCharsets.UTF_8))) { + writer.write(content); + //ok + return null; + } catch (final IOException e) { + Log.e(TAG, "Failed to write cache file", e); + return e.getMessage(); + } + } + +} diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleOrigin.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleOrigin.java new file mode 100644 index 00000000000..12fe7a7fe8f --- /dev/null +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleOrigin.java @@ -0,0 +1,38 @@ +package org.schabi.newpipe.util.subtitle; + +import androidx.annotation.NonNull; + +/** + * Describes the origin of a subtitle -> how its content was produced. + * + * - Currently, this enum covers YouTube subtitles, but + * it is designed to be extensible for other platforms + * (e.g. Peertube) in the future. + */ +public enum SubtitleOrigin { + + /** + * Currently, YouTube subtitles are categorized + * into three types: + * - Subtitles manually uploaded by the video creator + * (HUMAN_PROVIDED). + * - Subtitles automatically generated by YouTube's + * speech recognition (AUTO_GENERATED). + * - Subtitles automatically translated by YouTube + * from an existing subtitle track (AUTO_TRANSLATED). 
+ */ + HUMAN_PROVIDED("human_provided"), + AUTO_GENERATED("auto_generated"), + AUTO_TRANSLATED("auto_translated"); + + private final String id; + + SubtitleOrigin(@NonNull final String id) { + this.id = id; + } + + @NonNull + public String getId() { + return id; + } +} diff --git a/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleState.java b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleState.java new file mode 100644 index 00000000000..029ccc8fd3d --- /dev/null +++ b/app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleState.java @@ -0,0 +1,31 @@ +package org.schabi.newpipe.util.subtitle; + +import androidx.annotation.NonNull; + +/** + * Describes the processing state of a subtitle. + * + * - This enum represents whether the subtitle content + * is original or has been post-processed (e.g. deduplicated). + * - Unlike `enum SubtitleOrigin`, this does not describe + * how the subtitle was created, but how it has been + * processed locally. + */ +public enum SubtitleState { + + // Original subtitle content, no modifications + ORIGINAL("original"), + // Subtitle content after deduplication processing + DEDUPLICATED("deduplicated"); + + private final String id; + + SubtitleState(@NonNull final String id) { + this.id = id; + } + + @NonNull + public String getId() { + return id; + } +} diff --git a/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java b/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java index d723bfb4561..05f81d08d72 100644 --- a/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java +++ b/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java @@ -4,8 +4,12 @@ import org.schabi.newpipe.streams.SrtFromTtmlWriter; import org.schabi.newpipe.streams.io.SharpStream; +import org.schabi.newpipe.util.subtitle.SubtitleDeduplicator; +import java.nio.charset.StandardCharsets; import java.io.IOException; +import java.io.ByteArrayInputStream; +import 
java.io.ByteArrayOutputStream; /** * @author kapodamy @@ -23,12 +27,17 @@ int process(SharpStream out, SharpStream... sources) throws IOException { // check if the subtitle is already in srt and copy, this should never happen String format = getArgumentAt(0, null); boolean ignoreEmptyFrames = getArgumentAt(1, "true").equals("true"); - if (format == null || format.equals("ttml")) { SrtFromTtmlWriter writer = new SrtFromTtmlWriter(out, ignoreEmptyFrames); - try { - writer.build(sources[0]); + final String subtitleContent = + readSharpStreamToString(sources[0]); + final String deduplicated = + SubtitleDeduplicator.deduplicateContent(subtitleContent); + final SharpStream stream = + new ByteArraySharpStream( + deduplicated.getBytes(StandardCharsets.UTF_8)); + writer.build(stream); } catch (IOException err) { Log.e(TAG, "subtitle conversion failed due to I/O error", err); throw err; @@ -50,4 +59,107 @@ int process(SharpStream out, SharpStream... sources) throws IOException { throw new UnsupportedOperationException("Can't convert this subtitle, unimplemented format: " + format); } + private static String readSharpStreamToString(final SharpStream stream) throws IOException { + + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + final byte[] buffer = new byte[8192]; + + int read; + + // Note: `> 0` is required here because ChunkFileInputStream.read() + // returns 0 at EOF instead of -1. Using `!= -1` would result in + // an infinite loop in that case. + // + // Standard Java InputStream.read() returns -1 at EOF. + // + // Reference implementation: + // - ChunkFileInputStream.java + // + // Future note: + // - If ChunkFileInputStream changes to return -1 at EOF, this loop + // can safely be switched back to `read != -1`. Keeping `> 0` is + // also safe and will continue to work. 
+        while ((read = stream.read(buffer)) > 0) {
+            out.write(buffer, 0, read);
+        }
+
+        // Decode via new String(byte[], Charset): ByteArrayOutputStream.toString(Charset)
+        // is a Java 10 API that is missing on older Android runtimes.
+        return new String(out.toByteArray(), StandardCharsets.UTF_8);
+    }
+
+    /**
+     * Minimal SharpStream backed by a byte array.
+     */
+    private static final class ByteArraySharpStream extends SharpStream {
+        private final ByteArrayInputStream in;
+
+        ByteArraySharpStream(byte[] data) {
+            this.in = new ByteArrayInputStream(data);
+        }
+
+        @Override
+        public int read() {
+            return in.read();
+        }
+
+        @Override
+        public int read(byte[] buffer) {
+            return in.read(buffer, 0, buffer.length);
+        }
+
+        @Override
+        public int read(byte[] buffer, int offset, int count) {
+            return in.read(buffer, offset, count);
+        }
+
+        @Override
+        public long skip(long amount) {
+            return in.skip(amount);
+        }
+
+        @Override
+        public long available() {
+            return in.available();
+        }
+
+        @Override
+        public void rewind() {
+            // mark() is never called on `in`, so reset() returns to position 0.
+            in.reset();
+        }
+
+        @Override
+        public boolean isClosed() {
+            return false;
+        }
+
+        @Override
+        public void close() {}
+
+        @Override
+        public boolean canRewind() { return true; }
+
+        @Override
+        public boolean canRead() { return true; }
+
+        @Override
+        public boolean canWrite() { return false; }
+
+        @Override
+        public void write(byte value) throws IOException {
+            // This stream is read-only
+            // and used only for reading subtitle data.
+ throw new IOException("Stream is read-only"); + } + + @Override + public void write(byte[] buffer) throws IOException { + throw new IOException("Stream is read-only"); + } + + @Override + public void write(byte[] buffer, int offset, int count) throws IOException { + throw new IOException("Stream is read-only"); + } + } } diff --git a/app/src/test/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicatorTest.java b/app/src/test/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicatorTest.java new file mode 100644 index 00000000000..4f47900dfa4 --- /dev/null +++ b/app/src/test/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicatorTest.java @@ -0,0 +1,98 @@ +package org.schabi.newpipe.util.subtitle; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; + +public class SubtitleDeduplicatorTest { + + @Test + public void deduplicateExactDuplicateEntriesShouldRemoveDuplicate() { + final String input = + "

Hello

\n" + + "

Hello

"; + + final String output = SubtitleDeduplicator.deduplicateContent(input); + + final String expected = + "

Hello

"; + + // The `strip()` method is used here to remove the trailing + // newline character (\n, outside of

tags) at the end of the `output`. + // Removing this (\n) does not affect the TTML subtitle paragraphs, + // as only the content within

tags is considered valid for subtitles. + assertEquals(expected, output.strip()); + } + + @Test + public void deduplicateSameTimeDifferentTextShouldNotDeduplicate() { + final String input = + "

Hello

\n" + + "

World

"; + + final String output = SubtitleDeduplicator.deduplicateContent(input); + + final String expected = input; + + assertEquals(expected, output); + } + + @Test + public void deduplicateSameTextDifferentTimeShouldNotDeduplicate() { + final String input = + "

Hello

\n" + + "

Hello

"; + + final String output = SubtitleDeduplicator.deduplicateContent(input); + + final String expected = input; + + assertEquals(expected, output); + } + + @Test + public void containsDuplicatedEntriesExactDuplicateShouldReturnTrue() { + final String input = + "

Hello

\n" + + "

Hello

"; + + assertTrue(SubtitleDeduplicator.containsDuplicatedEntries(input)); + } + + @Test + public void containsDuplicatedEntriesNoDuplicateShouldReturnFalse() { + final String input = + "

Hello

\n" + + "

World

"; + + assertFalse(SubtitleDeduplicator.containsDuplicatedEntries(input)); + } + + @Test + public void containsDuplicatesNormalizeLeadingAndTrailingWhitespaceShouldConsiderAsSame() { + // Note: + // This test verifies that the deduplication logic normalizes + // leading and trailing whitespace, and considers the content + // as the same after this normalization, without modifying + // the original subtitle content. + final String input = + "

Hello world

\n" + + "

Hello world

"; + assertTrue(SubtitleDeduplicator.containsDuplicatedEntries(input)); + } + + @Test + public void containsDuplicatedEntriesNormalizeMultipleSpacesShouldConsiderAsSingleSpace() { + // Note: + // This test verifies that the deduplication logic normalizes + // multiple consecutive spaces into a single space, + // considering the content as the same after this normalization, + // without modifying the original subtitle content. + final String input = + "

Hello world

\n" + + "

Hello world

"; + assertTrue(SubtitleDeduplicator.containsDuplicatedEntries(input)); + } +}