Skip to content

Commit cc6e119

Browse files
committed
[duplicated subtitle] Refactor SubtitleDeduplicator for readability, without changing logic
- Rename methods:
  - getSubtitleKeyOfTtml() → buildDeduplicationKey()
  - storeItToCacheDir() → writeContentToCacheFile()
- Rename variables:
  - subtitleContent → ttmlFileContent
  - seen → processedKeys
  - subCacheDir → SUBTITLE_DEDUP_CACHE_DIR
- Improve deduplicateContent() by clarifying variable names and adding comments
- Add comments to explain the logic

(No functional changes)
1 parent 8d119f5 commit cc6e119

File tree

1 file changed

+65
-43
lines changed

1 file changed

+65
-43
lines changed

app/src/main/java/org/schabi/newpipe/util/subtitle/SubtitleDeduplicator.java

Lines changed: 65 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -59,21 +59,22 @@ public final class SubtitleDeduplicator {
5959
// in consecutive subtitle entries.
6060
private static final boolean SUPPORT_STYLED_SUBTITLE_RENDERING = false;
6161

62-
private static String subCacheDir = "subtitle_cache";
62+
private static final String SUBTITLE_DEDUP_CACHE_DIR = "subtitle_cache";
6363

6464
private static File cacheDir = null;
6565

6666
private SubtitleDeduplicator() {
6767
// no instance
6868
}
6969

70-
// cacheDir is /storage/emulated/0/Android/data/<package_name>/cache/{subCacheDir}
70+
// cacheDir is /storage/emulated/0/Android/data/<package_name>/
71+
// cache/{SUBTITLE_DEDUP_CACHE_DIR}
7172
public static void setCacheDirPath(final String path) {
7273
if (stringIsNullOrEmpty(path)) {
7374
return;
7475
}
7576

76-
cacheDir = new File(path, subCacheDir);
77+
cacheDir = new File(path, SUBTITLE_DEDUP_CACHE_DIR);
7778

7879
createDirIfNotExist(cacheDir);
7980
}
@@ -136,7 +137,7 @@ public static String checkAndDeduplicate(final String remoteSubtitleUrl,
136137
currentSubtitleOrigin,
137138
currentSubtitleState);
138139

139-
final String localSubtitleUri = storeItToCacheDir(finalContent,
140+
final String localSubtitleUri = writeContentToCacheFile(finalContent,
140141
format,
141142
currentSubtitleOrigin,
142143
currentCacheFile);
@@ -257,21 +258,21 @@ public static boolean containsDuplicateTtmlEntries(final File subtitleFile) {
257258
// using the same normalized (whitespace-trimmed) comparison rules
258259
// as deduplicateContent().
259260
// Note: entry == paragraph
260-
public static boolean containsDuplicatedEntries(final String subtitleContent) {
261-
if (stringIsNullOrEmpty(subtitleContent)) {
261+
public static boolean containsDuplicatedEntries(final String ttmlFileContent) {
262+
if (stringIsNullOrEmpty(ttmlFileContent)) {
262263
return false;
263264
}
264265

265-
final Matcher matcher = getTtmlMatcher(subtitleContent);
266+
final Matcher matcher = getTtmlMatcher(ttmlFileContent);
266267

267-
final Set<String> seen = new HashSet<>();
268+
final Set<String> processedKeys = new HashSet<>();
268269
while (matcher.find()) {
269-
final String key = getSubtitleKeyOfTtml(matcher);
270+
final String currentParagraphKey = buildDeduplicationKey(matcher);
270271

271-
if (seen.contains(key)) {
272+
if (processedKeys.contains(currentParagraphKey)) {
272273
return true;
273274
}
274-
seen.add(key);
275+
processedKeys.add(currentParagraphKey);
275276
}
276277

277278
return false;
@@ -302,7 +303,7 @@ public static String deduplicateTtmlFile(final File subtitleFile) {
302303
}
303304
}
304305

305-
public static String deduplicateContent(final String subtitleContent) {
306+
public static String deduplicateContent(final String ttmlFileContent) {
306307
// Subtitle entries/paragraphs are considered duplicated only if:
307308
// 1) begin timestamp is exactly the same,
308309
// 2) end timestamp is exactly the same,
@@ -312,30 +313,50 @@ public static String deduplicateContent(final String subtitleContent) {
312313
// This is a normalized comparison (trimmed and whitespace-normalized).
313314
// No semantic analysis or fuzzy matching is performed.
314315

315-
if (stringIsNullOrEmpty(subtitleContent)) {
316-
return subtitleContent;
316+
if (stringIsNullOrEmpty(ttmlFileContent)) {
317+
return ttmlFileContent;
317318
}
318319

319-
final Matcher matcher = getTtmlMatcher(subtitleContent);
320-
321-
final Set<String> seen = new HashSet<>();
320+
final Set<String> processedKeys = new HashSet<>();
322321
final StringBuilder result = new StringBuilder();
323322

324-
int lastIndex = 0;
325-
while (matcher.find()) {
326-
result.append(subtitleContent, lastIndex, matcher.start());
323+
// Create a matcher for all <p>...</p> entries
324+
final Matcher matcher = getTtmlMatcher(ttmlFileContent);
327325

328-
final String key = getSubtitleKeyOfTtml(matcher);
326+
// Keep track of the end index of the last processed <p>
327+
int lastParagraphEndIndex = 0;
329328

330-
if (!seen.contains(key)) {
331-
result.append(matcher.group(0));
332-
seen.add(key);
329+
while (matcher.find()) {
330+
// Extract the gap between the previous <p> and the current <p>
331+
// - it may contain whitespace, newlines, or other XML elements.
332+
// - it is NOT part of the subtitle paragraph.
333+
// - It is never used for deduplication or screen display.
334+
final String gapBetweenParagraphs = ttmlFileContent.substring(
335+
lastParagraphEndIndex,
336+
matcher.start()
337+
);
338+
result.append(gapBetweenParagraphs);
339+
340+
final String currentParagraph = matcher.group(0);
341+
final String currentParagraphKey = buildDeduplicationKey(matcher);
342+
343+
if (!processedKeys.contains(currentParagraphKey)) {
344+
// Append the ORIGINAL full <p> paragraph.
345+
// - This preserves the author's original formatting
346+
// (runs of whitespace, <br>, etc.).
347+
result.append(currentParagraph);
348+
processedKeys.add(currentParagraphKey);
333349
}
334350

335-
lastIndex = matcher.end();
351+
// Move the last processed index to the end of the current <p>
352+
lastParagraphEndIndex = matcher.end();
336353
}
337354

338-
result.append(subtitleContent.substring(lastIndex));
355+
// Append any remaining content after the last <p>.
356+
// - Usually contains closing tags like </div>, </body>, </tt>.
357+
final String trailingContent = ttmlFileContent.substring(lastParagraphEndIndex);
358+
result.append(trailingContent);
359+
339360
return result.toString();
340361
}
341362

@@ -363,7 +384,18 @@ private static Matcher getTtmlMatcher(final String subtitleContent) {
363384
return pattern.matcher(subtitleContent);
364385
}
365386

366-
private static String getSubtitleKeyOfTtml(final Matcher matcher) {
387+
/**
388+
* Generates a deduplication key for one TTML {@code <p>} paragraph.
389+
*
390+
* @param matcher Matcher already positioned on a single {@code <p>} element.
391+
* group(1) = begin time
392+
* group(2) = end time
393+
* group(3) = raw textual content (may contain 'span' tags)
394+
* @return a deduplication key composed of begin/end timestamps
395+
* and normalized text, used to detect whether this subtitle entry
396+
* has already been processed.
397+
*/
398+
private static String buildDeduplicationKey(final Matcher matcher) {
367399
final String begin = matcher.group(1).trim();
368400
final String end = matcher.group(2).trim();
369401

@@ -456,22 +488,20 @@ private static String buildLocalFileUri(final File subtitleCacheFile) {
456488
return path;
457489
}
458490

459-
private static String storeItToCacheDir(final String subtitleContent,
491+
private static String writeContentToCacheFile(final String subtitleContent,
460492
final MediaFormat format,
461493
final SubtitleOrigin currentSubtitleOrigin,
462494
final File currentCacheFile) {
463-
final File cacheFile = currentCacheFile;
495+
final String cacheFilePathForExoplayer = buildLocalFileUri(currentCacheFile);
464496

465-
final String cacheFilePathForExoplayer = buildLocalFileUri(cacheFile);
466-
467-
if (!ensureItsParentDirExist(cacheFile)) {
497+
if (!ensureItsParentDirExist(currentCacheFile)) {
468498
return null;
469499
}
470500

471-
if (null == writeDeduplicatedContentToCachefile(subtitleContent, cacheFile)) {
501+
if (null == writeContentToFile(subtitleContent, currentCacheFile)) {
472502
return cacheFilePathForExoplayer;
473503
} else {
474-
Log.e(TAG, "Failed to write cache file: " + cacheFile.getAbsolutePath());
504+
Log.e(TAG, "Failed to write cache file: " + currentCacheFile.getAbsolutePath());
475505
return null;
476506
}
477507
}
@@ -526,9 +556,7 @@ private static String buildSubtitleCacheFilename(
526556
}
527557

528558
private static String getLanguageCode(final String remoteSubtitleUrl) {
529-
String languageCode = null;
530-
languageCode = YoutubeParsingHelper.extractLanguageCode(remoteSubtitleUrl);
531-
return languageCode;
559+
return YoutubeParsingHelper.extractLanguageCode(remoteSubtitleUrl);
532560
}
533561

534562
private static String getAutoTranslateLanguage(final String remoteSubtitleUrl) {
@@ -653,12 +681,6 @@ private static boolean ensureItsParentDirExist(final File tempCacheFile) {
653681
}
654682
}
655683

656-
private static String writeDeduplicatedContentToCachefile(
657-
final String subtitleContent,
658-
final File tempCacheFile) {
659-
return writeContentToFile(subtitleContent, tempCacheFile);
660-
}
661-
662684
private static String writeContentToFile(final String content,
663685
final File tempFile) {
664686
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(

0 commit comments

Comments
 (0)