Skip to content

Commit 6c3c2e2

Browse files
authored
Merge pull request #1163 from AudricV/yt-fix_comments_extraction
[YouTube] Support new comments data
2 parents e5b30ae + 02274d5 commit 6c3c2e2

8 files changed

Lines changed: 779 additions & 154 deletions

File tree

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
package org.schabi.newpipe.extractor.services.youtube;
2+
3+
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getUrlFromNavigationEndpoint;
4+
import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
5+
6+
import com.grack.nanojson.JsonObject;
7+
8+
import org.jsoup.nodes.Entities;
9+
10+
import java.util.ArrayList;
11+
import java.util.Collections;
12+
import java.util.Comparator;
13+
import java.util.List;
14+
import java.util.Stack;
15+
import java.util.function.Function;
16+
import java.util.regex.Matcher;
17+
import java.util.regex.Pattern;
18+
19+
import javax.annotation.Nonnull;
20+
import javax.annotation.Nullable;
21+
22+
public final class YoutubeDescriptionHelper {
23+
24+
// Every member of this class is static; the private constructor prevents instantiation.
private YoutubeDescriptionHelper() {
25+
}
26+
27+
// Closing tag shared by all link runs; the matching opening tag is built per link from its URL.
private static final String LINK_CLOSE = "</a>";
28+
// Opening/closing tags emitted for style runs carrying a "strikethrough" key.
private static final String STRIKETHROUGH_OPEN = "<s>";
29+
private static final String STRIKETHROUGH_CLOSE = "</s>";
30+
// Opening/closing tags emitted for style runs whose "weightLabel" is not FONT_WEIGHT_NORMAL.
private static final String BOLD_OPEN = "<b>";
31+
private static final String BOLD_CLOSE = "</b>";
32+
// Opening/closing tags emitted for style runs whose "italic" flag is true.
private static final String ITALIC_OPEN = "<i>";
33+
private static final String ITALIC_CLOSE = "</i>";
34+
35+
// special link chips (e.g. for YT videos, YT channels or social media accounts):
36+
// (u00a0) u00a0 u00a0 [/•] u00a0 <link content> u00a0 u00a0
37+
// runsToHtml replaces every U+00A0 with a regular space before link text reaches this pattern,
// so the chip padding is matched as runs of spaces here. Group 1 captures the link content
// between the leading "<spaces> / or • <spaces>" prefix and the trailing spaces; (?s) lets '.'
// match line terminators as well.
private static final Pattern LINK_CONTENT_CLEANER_REGEX
38+
= Pattern.compile("(?s)^ +[/•] +(.*?) +$");
39+
40+
/**
41+
* Can be a command run, or a style run.
42+
*/
43+
static final class Run {
44+
@Nonnull final String open;
45+
@Nonnull final String close;
46+
final int pos;
47+
@Nullable final Function<String, String> transformContent;
48+
int openPosInOutput = -1;
49+
50+
Run(
51+
@Nonnull final String open,
52+
@Nonnull final String close,
53+
final int pos
54+
) {
55+
this(open, close, pos, null);
56+
}
57+
58+
Run(
59+
@Nonnull final String open,
60+
@Nonnull final String close,
61+
final int pos,
62+
@Nullable final Function<String, String> transformContent
63+
) {
64+
this.open = open;
65+
this.close = close;
66+
this.pos = pos;
67+
this.transformContent = transformContent;
68+
}
69+
70+
public boolean sameOpen(@Nonnull final Run other) {
71+
return open.equals(other.open);
72+
}
73+
}
74+
75+
/**
76+
* Parse a video description in the new "attributed" format, which contains the entire visible
77+
* plaintext ({@code content}) and an array of {@code commandRuns} and {@code styleRuns}.
78+
* Returns the formatted content in HTML format, and escapes the text to make sure there are no
79+
* XSS attacks.
80+
*
81+
* <p>
82+
* {@code commandRuns} include the links and their range in the text, while {@code styleRuns}
83+
* include the styling to apply to various ranges in the text.
84+
* </p>
85+
*
86+
* @param attributedDescription the JSON object of the attributed description
87+
* @return the parsed description, in HTML format, as a string
88+
*/
89+
@Nullable
90+
public static String attributedDescriptionToHtml(
91+
@Nullable final JsonObject attributedDescription
92+
) {
93+
if (isNullOrEmpty(attributedDescription)) {
94+
return null;
95+
}
96+
97+
final String content = attributedDescription.getString("content");
98+
if (content == null) {
99+
return null;
100+
}
101+
102+
// all run pairs must always of length at least 1, or they should be discarded,
103+
// otherwise various assumptions made in runsToHtml may fail
104+
final List<Run> openers = new ArrayList<>();
105+
final List<Run> closers = new ArrayList<>();
106+
addAllCommandRuns(attributedDescription, openers, closers);
107+
addAllStyleRuns(attributedDescription, openers, closers);
108+
109+
// Note that sorting this way might put closers with the same close position in the wrong
110+
// order with respect to their openers, causing unnecessary closes and reopens. E.g.
111+
// <b>b<i>b&i</i></b> is instead generated as <b>b<i>b&i</b></i><b></b> if the </b> is
112+
// encountered before the </i>. Solving this wouldn't be difficult, thanks to stable sort,
113+
// but would require additional sorting steps which would just make this slower for the
114+
// general case where it's unlikely there are coincident closes.
115+
Collections.sort(openers, Comparator.comparingInt(run -> run.pos));
116+
Collections.sort(closers, Comparator.comparingInt(run -> run.pos));
117+
118+
return runsToHtml(openers, closers, content);
119+
}
120+
121+
/**
122+
* Applies the formatting specified by the intervals stored in {@code openers} and {@code
123+
* closers} to {@code content} in order to obtain valid HTML even when intervals overlap. For
124+
* example &lt;b&gt;b&lt;i&gt;b&i&lt;/b&gt;i&lt;/i&gt; would not be valid HTML, so this function
125+
* instead generates &lt;b&gt;b&lt;i&gt;b&i&lt;/i&gt;&lt;/b&gt;&lt;i&gt;i&lt;/i&gt;. Any HTML
126+
* special characters in {@code rawContent} are escaped to make sure there are no XSS attacks.
127+
*
128+
* <p>
129+
* Every opener in {@code openers} must have a corresponding closer in {@code closers}. Every
130+
* corresponding (opener, closer) pair must have a length of at least one (i.e. empty intervals
131+
* are not allowed).
132+
* </p>
133+
*
134+
* @param openers contains all of the places where a run begins, must have the same size of
135+
* closers, must be ordered by {@link Run#pos}
136+
* @param closers contains all of the places where a run ends, must have the same size of
137+
* openers, must be ordered by {@link Run#pos}
138+
* @param rawContent the content to apply formatting to, and to escape to avoid XSS
139+
* @return the formatted content in HTML
140+
*/
141+
static String runsToHtml(
142+
@Nonnull final List<Run> openers,
143+
@Nonnull final List<Run> closers,
144+
@Nonnull final String rawContent
145+
) {
146+
final String content = rawContent.replace('\u00a0', ' ');
147+
final Stack<Run> openRuns = new Stack<>();
148+
final Stack<Run> tempStack = new Stack<>();
149+
final StringBuilder textBuilder = new StringBuilder();
150+
int currentTextPos = 0;
151+
int openersIndex = 0;
152+
int closersIndex = 0;
153+
154+
// openers and closers have the same length, but we will surely finish openers earlier than
155+
// closers, since every opened interval needs to be closed at some point and there can't be
156+
// empty intervals, hence check only closersIndex < closers.size()
157+
while (closersIndex < closers.size()) {
158+
final int minPos = openersIndex < openers.size()
159+
? Math.min(closers.get(closersIndex).pos, openers.get(openersIndex).pos)
160+
: closers.get(closersIndex).pos;
161+
162+
// append piece of text until current index
163+
textBuilder.append(Entities.escape(content.substring(currentTextPos, minPos)));
164+
currentTextPos = minPos;
165+
166+
if (closers.get(closersIndex).pos == minPos) {
167+
// even in case of position tie, first process closers
168+
final Run closer = closers.get(closersIndex);
169+
++closersIndex;
170+
171+
// because of the assumptions, this while wouldn't need the !openRuns.empty()
172+
// condition, because no run will close before being opened, but let's be sure
173+
while (!openRuns.empty()) {
174+
final Run popped = openRuns.pop();
175+
if (popped.sameOpen(closer)) {
176+
// before closing the current run, if the run has a transformContent
177+
// function, use it to transform the content of the current run, based on
178+
// the openPosInOutput set when the current run was opened
179+
if (popped.transformContent != null && popped.openPosInOutput >= 0) {
180+
textBuilder.replace(popped.openPosInOutput, textBuilder.length(),
181+
popped.transformContent.apply(
182+
textBuilder.substring(popped.openPosInOutput)));
183+
}
184+
// close the run that we really need to close
185+
textBuilder.append(popped.close);
186+
break;
187+
}
188+
// we keep popping from openRuns, closing all of the runs we find,
189+
// until we find the run that we really need to close ...
190+
textBuilder.append(popped.close);
191+
tempStack.push(popped);
192+
}
193+
while (!tempStack.empty()) {
194+
// ... and then we reopen all of the runs that we didn't need to close
195+
// e.g. in <b>b<i>b&i</b>i</i>, when </b> is encountered, </i></b><i> is printed
196+
// instead, to make sure the HTML is valid, obtaining <b>b<i>b&i</i></b><i>i</i>
197+
final Run popped = tempStack.pop();
198+
textBuilder.append(popped.open);
199+
openRuns.push(popped);
200+
}
201+
202+
} else {
203+
// this will never be reached if openersIndex >= openers.size() because of the
204+
// way minPos is calculated
205+
final Run opener = openers.get(openersIndex);
206+
textBuilder.append(opener.open);
207+
opener.openPosInOutput = textBuilder.length(); // save for transforming later
208+
openRuns.push(opener);
209+
++openersIndex;
210+
}
211+
}
212+
213+
// append last piece of text
214+
textBuilder.append(Entities.escape(content.substring(currentTextPos)));
215+
216+
return textBuilder.toString()
217+
.replace("\n", "<br>")
218+
.replace(" ", " &nbsp;");
219+
}
220+
221+
private static void addAllCommandRuns(
222+
@Nonnull final JsonObject attributedDescription,
223+
@Nonnull final List<Run> openers,
224+
@Nonnull final List<Run> closers
225+
) {
226+
attributedDescription.getArray("commandRuns")
227+
.stream()
228+
.filter(JsonObject.class::isInstance)
229+
.map(JsonObject.class::cast)
230+
.forEach(run -> {
231+
final JsonObject navigationEndpoint = run.getObject("onTap")
232+
.getObject("innertubeCommand");
233+
234+
final int startIndex = run.getInt("startIndex", -1);
235+
final int length = run.getInt("length", 0);
236+
if (startIndex < 0 || length < 1 || navigationEndpoint == null) {
237+
return;
238+
}
239+
240+
final String url = getUrlFromNavigationEndpoint(navigationEndpoint);
241+
if (url == null) {
242+
return;
243+
}
244+
245+
final String open = "<a href=\"" + Entities.escape(url) + "\">";
246+
final Function<String, String> transformContent = getTransformContentFun(run);
247+
248+
openers.add(new Run(open, LINK_CLOSE, startIndex, transformContent));
249+
closers.add(new Run(open, LINK_CLOSE, startIndex + length, transformContent));
250+
});
251+
}
252+
253+
private static Function<String, String> getTransformContentFun(final JsonObject run) {
254+
final String accessibilityLabel = run.getObject("onTapOptions")
255+
.getObject("accessibilityInfo")
256+
.getString("accessibilityLabel", "")
257+
// accessibility labels are e.g. "Instagram Channel Link: instagram_profile_name"
258+
.replaceFirst(" Channel Link", "");
259+
260+
final Function<String, String> transformContent;
261+
if (accessibilityLabel.isEmpty() || accessibilityLabel.startsWith("YouTube: ")) {
262+
// if there is no accessibility label, or the link points to YouTube, cleanup the link
263+
// text, see LINK_CONTENT_CLEANER_REGEX's documentation for more details
264+
transformContent = (content) -> {
265+
final Matcher m = LINK_CONTENT_CLEANER_REGEX.matcher(content);
266+
if (m.find()) {
267+
return m.group(1);
268+
}
269+
return content;
270+
};
271+
} else {
272+
// if there is an accessibility label, replace the link text with it, because on the
273+
// YouTube website an ambiguous link text is next to an icon explaining which service it
274+
// belongs to, but since we can't add icons, we instead use the accessibility label
275+
// which contains information about the service
276+
transformContent = (content) -> accessibilityLabel;
277+
}
278+
279+
return transformContent;
280+
}
281+
282+
private static void addAllStyleRuns(
283+
@Nonnull final JsonObject attributedDescription,
284+
@Nonnull final List<Run> openers,
285+
@Nonnull final List<Run> closers
286+
) {
287+
attributedDescription.getArray("styleRuns")
288+
.stream()
289+
.filter(JsonObject.class::isInstance)
290+
.map(JsonObject.class::cast)
291+
.forEach(run -> {
292+
final int start = run.getInt("startIndex", -1);
293+
final int length = run.getInt("length", 0);
294+
if (start < 0 || length < 1) {
295+
return;
296+
}
297+
final int end = start + length;
298+
299+
if (run.has("strikethrough")) {
300+
openers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, start));
301+
closers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, end));
302+
}
303+
304+
if (run.getBoolean("italic", false)) {
305+
openers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, start));
306+
closers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, end));
307+
}
308+
309+
if (run.has("weightLabel")
310+
&& !"FONT_WEIGHT_NORMAL".equals(run.getString("weightLabel"))) {
311+
openers.add(new Run(BOLD_OPEN, BOLD_CLOSE, start));
312+
closers.add(new Run(BOLD_OPEN, BOLD_CLOSE, end));
313+
}
314+
});
315+
}
316+
}

0 commit comments

Comments
 (0)