forked from TeamNewPipe/NewPipe
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSrtFromTtmlWriter.java
More file actions
127 lines (108 loc) · 4.16 KB
/
SrtFromTtmlWriter.java
File metadata and controls
127 lines (108 loc) · 4.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
package org.schabi.newpipe.streams;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;
import org.schabi.newpipe.streams.io.SharpStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
/**
 * Converts a TTML subtitle document into SubRip (SRT) text.
 *
 * <p>Only a basic subset of TTML is handled: every {@code <p>} element matched by the
 * selector {@code body > div > p} becomes one SRT frame whose timestamps are taken from the
 * element's {@code begin} and {@code end} attributes. Styling information is discarded.</p>
 *
 * @author kapodamy
 */
public class SrtFromTtmlWriter {
    private static final String NEW_LINE = "\r\n";

    private final SharpStream out;
    private final boolean ignoreEmptyFrames;
    private final Charset charset = StandardCharsets.UTF_8;

    // NOTE(review): the counter starts at 0 while SRT files conventionally number frames
    // from 1. Left unchanged here because it alters the emitted output; confirm with players.
    private int frameIndex = 0;

    /**
     * @param out               the stream the generated SRT text is written to
     * @param ignoreEmptyFrames if {@code true}, paragraphs that yield no text are skipped
     */
    public SrtFromTtmlWriter(final SharpStream out, final boolean ignoreEmptyFrames) {
        this.out = out;
        this.ignoreEmptyFrames = ignoreEmptyFrames;
    }

    /**
     * Reads a timestamp attribute and converts its decimal separator to the SRT form.
     *
     * @param frame the element holding the timestamp attribute
     * @param attr  the attribute name (e.g. {@code begin} or {@code end})
     * @return the attribute value with every {@code '.'} replaced by {@code ','}
     */
    private static String getTimestamp(final Element frame, final String attr) {
        return frame
                .attr(attr)
                .replace('.', ','); // SRT subtitles uses comma as decimal separator
    }

    /**
     * Buffers everything the input stream announces as available.
     *
     * <p>A single {@code read} call is allowed to deliver fewer bytes than requested, so the
     * stream is read in a loop. Only the bytes that were really read are exposed to the
     * parser; a short stream therefore never produces zero padding at the end.</p>
     *
     * @param ttml the stream to drain
     * @return an in-memory stream over the bytes that were read
     * @throws IOException if reading fails or the announced size cannot fit in one array
     */
    private static ByteArrayInputStream readAvailable(final SharpStream ttml)
            throws IOException {
        final long available = ttml.available();
        // a plain (int) cast would silently truncate (or go negative) for huge values
        if (available < 0 || available > Integer.MAX_VALUE) {
            throw new IOException("Cannot buffer TTML input of " + available + " bytes");
        }

        final byte[] buffer = new byte[(int) available];
        int filled = 0;
        while (filled < buffer.length) {
            final int read = ttml.read(buffer, filled, buffer.length - filled);
            if (read <= 0) {
                break; // the stream ended earlier than announced
            }
            filled += read;
        }

        return new ByteArrayInputStream(buffer, 0, filled);
    }

    /**
     * Writes one SRT frame: index line, timestamp line, text and a terminating blank line.
     *
     * @param begin the already converted start timestamp
     * @param end   the already converted end timestamp
     * @param text  the frame text
     * @throws IOException if writing to the output stream fails
     */
    private void writeFrame(final String begin, final String end, final StringBuilder text)
            throws IOException {
        writeString(String.valueOf(frameIndex++));
        writeString(NEW_LINE);
        writeString(begin);
        writeString(" --> ");
        writeString(end);
        writeString(NEW_LINE);
        writeString(text.toString());
        writeString(NEW_LINE);
        writeString(NEW_LINE);
    }

    /**
     * Encodes the text with the writer's charset and appends it to the output stream.
     *
     * @param text the text to write
     * @throws IOException if writing to the output stream fails
     */
    private void writeString(final String text) throws IOException {
        out.write(text.getBytes(charset));
    }

    // CHECKSTYLE:OFF checkstyle:JavadocStyle
    // checkstyle does not understand that span tags are inside a code block
    /**
     * <p>Recursive method to extract text from all nodes.</p>
     * <p>
     * This method processes {@link TextNode}s and {@code <br>} tags,
     * recursively extracting text from nested tags
     * (e.g. extracting text from nested {@code <span>} tags).
     * Newlines are added for {@code <br>} tags.
     * </p>
     * @param node the current node to process
     * @param text the {@link StringBuilder} to append the extracted text to
     */
    private void extractText(final Node node, final StringBuilder text) {
        if (node instanceof TextNode textNode) {
            text.append(textNode.text());
        } else if (node instanceof Element element
                && element.tagName().equalsIgnoreCase("br")) {
            // <br> is a self-closing tag used to insert a line break
            text.append(NEW_LINE);
        }

        // Recursively process child nodes
        for (final Node child : node.childNodes()) {
            extractText(child, text);
        }
    }
    // CHECKSTYLE:ON

    /**
     * Parses the given TTML document and writes the equivalent SRT frames to the output.
     *
     * <p>Nothing is written when the document contains no matching paragraphs.</p>
     *
     * @param ttml the stream providing the TTML document
     * @throws IOException if the input cannot be read or buffered, or the output cannot be
     *                     written
     */
    public void build(final SharpStream ttml) throws IOException {
        /*
         * TTML parser with BASIC support
         * multiple CUE is not supported
         * styling is not supported
         * tag timestamps (in auto-generated subtitles) are not supported, maybe in the future
         * also TimestampTagOption enum is not applicable
         * Language parsing is not supported
         */

        // parse XML
        final Document doc = Jsoup.parse(readAvailable(ttml), charset.name(), "",
                Parser.xmlParser());

        final StringBuilder text = new StringBuilder(128);
        final Elements paragraphList = doc.select("body > div > p");

        // an empty list simply means there are no frames to write
        for (final Element paragraph : paragraphList) {
            text.setLength(0);

            // Recursively extract text from all child nodes
            extractText(paragraph, text);

            if (ignoreEmptyFrames && text.length() == 0) {
                continue;
            }

            final String begin = getTimestamp(paragraph, "begin");
            final String end = getTimestamp(paragraph, "end");

            writeFrame(begin, end, text);
        }
    }
}