Skip to content

Commit 09732d6

Browse files
Stypox authored and AudricV committed
[YouTube] Add support for styles in attributed descriptions
Also refactor descriptions parsing.
1 parent 293c3e9 commit 09732d6

4 files changed

Lines changed: 259 additions & 84 deletions

File tree

Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
package org.schabi.newpipe.extractor.services.youtube;
2+
3+
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getUrlFromNavigationEndpoint;
4+
import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
5+
6+
import com.grack.nanojson.JsonObject;
7+
8+
import org.jsoup.nodes.Entities;
9+
10+
import java.util.ArrayList;
11+
import java.util.Collections;
12+
import java.util.Comparator;
13+
import java.util.List;
14+
import java.util.Stack;
15+
16+
import javax.annotation.Nonnull;
17+
import javax.annotation.Nullable;
18+
19+
public final class YoutubeDescriptionHelper {
20+
21+
/**
 * Private constructor: this class only exposes static members and is not meant to be
 * instantiated.
 */
private YoutubeDescriptionHelper() {
22+
}
23+
24+
// HTML tags emitted around formatted ranges of a description. There is no constant for the
// opening link tag because it embeds the target URL and is therefore built per link.
public static final String LINK_CLOSE = "</a>";
25+
public static final String STRIKETHROUGH_OPEN = "<s>";
26+
public static final String STRIKETHROUGH_CLOSE = "</s>";
27+
public static final String BOLD_OPEN = "<b>";
28+
public static final String BOLD_CLOSE = "</b>";
29+
public static final String ITALIC_OPEN = "<i>";
30+
public static final String ITALIC_CLOSE = "</i>";
31+
32+
/**
33+
* Can be a command run, or a style run.
34+
*/
35+
static final class Run {
36+
@Nonnull final String open;
37+
@Nonnull final String close;
38+
final int pos;
39+
final boolean isClose;
40+
41+
Run(
42+
@Nonnull final String open,
43+
@Nonnull final String close,
44+
final int pos,
45+
final boolean isClose
46+
) {
47+
this.open = open;
48+
this.close = close;
49+
this.pos = pos;
50+
this.isClose = isClose;
51+
}
52+
53+
public boolean sameOpen(@Nonnull final Run other) {
54+
return open.equals(other.open);
55+
}
56+
}
57+
58+
/**
59+
* Parse a video description in the new "attributed" format, which contains the entire visible
60+
* plaintext ({@code content}) and an array of {@code commandRuns}.
61+
*
62+
* <p>
63+
* The {@code commandRuns} include the links and their position in the text.
64+
* </p>
65+
*
66+
* @param attributedDescription the JSON object of the attributed description
67+
* @return the parsed description, in HTML format, as a string
68+
*/
69+
public static String attributedDescriptionToHtml(
70+
@Nullable final JsonObject attributedDescription
71+
) {
72+
if (isNullOrEmpty(attributedDescription)) {
73+
return null;
74+
}
75+
76+
final String content = attributedDescription.getString("content");
77+
if (content == null) {
78+
return null;
79+
}
80+
81+
// all run pairs must always of length at least 1, or they should be discarded,
82+
// otherwise various assumptions made in runsToHtml may fail
83+
final List<Run> openers = new ArrayList<>();
84+
final List<Run> closers = new ArrayList<>();
85+
addAllCommandRuns(attributedDescription, openers, closers);
86+
addAllStyleRuns(attributedDescription, openers, closers);
87+
88+
// Note that sorting this way might put closers with the same close position in the wrong
89+
// order with respect to their openers, causing unnecessary closes and reopens. E.g.
90+
// <b>b<i>b&i</i></b> is instead generated as <b>b<i>b&i</b></i><b></b> if the </b> is
91+
// encountered before the </i>. Solving this wouldn't be difficult, thanks to stable sort,
92+
// but would require additional sorting steps which would just make this slower for the
93+
// general case where it's unlikely there are coincident closes.
94+
Collections.sort(openers, Comparator.comparingInt(run -> run.pos));
95+
Collections.sort(closers, Comparator.comparingInt(run -> run.pos));
96+
97+
return runsToHtml(openers, closers, content);
98+
}
99+
100+
/**
101+
* Applies the formatting specified by the intervals stored in {@code openers} and {@code
102+
* closers} to {@code content} in order to obtain valid HTML even when intervals overlap. For
103+
* example &lt;b&gt;b&lt;i&gt;b&i&lt;/b&gt;i&lt;/i&gt; would not be valid HTML, so this function
104+
* instead generates &lt;b&gt;b&lt;i&gt;b&i&lt;/i&gt;&lt;/b&gt;&lt;i&gt;i&lt;/i&gt;.
105+
* <p>
106+
* Every opener in {@code openers} must have a corresponding closer in {@code closers}. Every
107+
* corresponding (opener, closer) pair must have a length of at least one (i.e. empty intervals
108+
* are not allowed).
109+
* </p>
110+
*
111+
* @param openers contains all of the places where a run begins, must have the same size of
112+
* closers, must be ordered by {@link Run#pos}
113+
* @param closers contains all of the places where a run ends, must have the same size of
114+
* openers, must be ordered by {@link Run#pos}
115+
* @param content the content to apply formatting to
116+
* @return the formatted content in HTML
117+
*/
118+
static String runsToHtml(
119+
@Nonnull final List<Run> openers,
120+
@Nonnull final List<Run> closers,
121+
@Nonnull final String content
122+
) {
123+
final Stack<Run> openRuns = new Stack<>();
124+
final Stack<Run> tempStack = new Stack<>();
125+
final StringBuilder textBuilder = new StringBuilder();
126+
int currentTextPos = 0;
127+
int openersIndex = 0;
128+
int closersIndex = 0;
129+
130+
// openers and closers have the same length, but we will surely finish openers earlier than
131+
// closers, since every opened interval needs to be closed at some point and there can't be
132+
// empty intervals, hence check only closersIndex < closers.size()
133+
while (closersIndex < closers.size()) {
134+
final int minPos = openersIndex < openers.size()
135+
? Math.min(closers.get(closersIndex).pos, openers.get(openersIndex).pos)
136+
: closers.get(closersIndex).pos;
137+
138+
// append piece of text until current index
139+
textBuilder.append(content, currentTextPos, minPos);
140+
currentTextPos = minPos;
141+
142+
if (closers.get(closersIndex).pos == minPos) {
143+
// even in case of position tie, first process closers
144+
final Run closer = closers.get(closersIndex);
145+
++closersIndex;
146+
147+
// because of the assumptions, this while wouldn't need the !openRuns.empty()
148+
// condition, because no run will close before being opened, but let's be sure
149+
while (!openRuns.empty()) {
150+
final Run popped = openRuns.pop();
151+
textBuilder.append(popped.close);
152+
if (popped.sameOpen(closer)) {
153+
break;
154+
}
155+
// we keep popping from openRuns, closing all of the runs we find,
156+
// until we find the run that we really need to close ...
157+
tempStack.push(popped);
158+
}
159+
while (!tempStack.empty()) {
160+
// ... and then we reopen all of the runs that we didn't need to close
161+
// e.g. in <b>b<i>b&i</b>i</i>, when </b> is encountered, </i></b><i> is printed
162+
// instead, to make sure the HTML is valid, obtaining <b>b<i>b&i</i></b><i>i</i>
163+
final Run popped = tempStack.pop();
164+
textBuilder.append(popped.open);
165+
openRuns.push(popped);
166+
}
167+
168+
} else {
169+
// this will never be reached if openersIndex >= openers.size() because of the
170+
// way minPos is calculated
171+
textBuilder.append(openers.get(openersIndex).open);
172+
openRuns.push(openers.get(openersIndex));
173+
++openersIndex;
174+
}
175+
}
176+
177+
// append last piece of text
178+
textBuilder.append(content, currentTextPos, content.length());
179+
180+
return textBuilder.toString()
181+
.replace("\n", "<br>")
182+
.replace(" ", " &nbsp;")
183+
// special link chips (e.g. for YT videos, YT channels or social media accounts):
184+
// u00a0 u00a0 [/•] u00a0 <link content> u00a0 u00a0
185+
.replace("\">\u00a0\u00a0/\u00a0", "\">")
186+
.replace("\">\u00a0\u00a0\u00a0", "\">")
187+
.replace("\u00a0\u00a0</a>", "</a>");
188+
}
189+
190+
private static void addAllCommandRuns(
191+
@Nonnull final JsonObject attributedDescription,
192+
@Nonnull final List<Run> openers,
193+
@Nonnull final List<Run> closers
194+
) {
195+
attributedDescription.getArray("commandRuns")
196+
.stream()
197+
.filter(JsonObject.class::isInstance)
198+
.map(JsonObject.class::cast)
199+
.forEach(run -> {
200+
final JsonObject navigationEndpoint = run.getObject("onTap")
201+
.getObject("innertubeCommand");
202+
203+
final int startIndex = run.getInt("startIndex", -1);
204+
final int length = run.getInt("length", 0);
205+
if (startIndex < 0 || length < 1 || navigationEndpoint == null) {
206+
return;
207+
}
208+
209+
final String url = getUrlFromNavigationEndpoint(navigationEndpoint);
210+
if (url == null) {
211+
return;
212+
}
213+
214+
final String open = "<a href=\"" + Entities.escape(url) + "\">";
215+
216+
openers.add(new Run(open, LINK_CLOSE, startIndex, false));
217+
closers.add(new Run(open, LINK_CLOSE, startIndex + length, true));
218+
});
219+
}
220+
221+
private static void addAllStyleRuns(
222+
@Nonnull final JsonObject attributedDescription,
223+
@Nonnull final List<Run> openers,
224+
@Nonnull final List<Run> closers
225+
) {
226+
attributedDescription.getArray("styleRuns")
227+
.stream()
228+
.filter(JsonObject.class::isInstance)
229+
.map(JsonObject.class::cast)
230+
.forEach(run -> {
231+
final int start = run.getInt("startIndex", -1);
232+
final int length = run.getInt("length", 0);
233+
if (start < 0 || length < 1) {
234+
return;
235+
}
236+
final int end = start + length;
237+
238+
if (run.has("strikethrough")) {
239+
openers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, start, false));
240+
closers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, end, true));
241+
}
242+
243+
if (run.getBoolean("italic", false)) {
244+
openers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, start, false));
245+
closers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, end, true));
246+
}
247+
248+
if (run.has("weightLabel")
249+
&& !"FONT_WEIGHT_NORMAL".equals(run.getString("weightLabel"))) {
250+
openers.add(new Run(BOLD_OPEN, BOLD_CLOSE, start, false));
251+
closers.add(new Run(BOLD_OPEN, BOLD_CLOSE, end, true));
252+
}
253+
});
254+
}
255+
}

extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java

Lines changed: 0 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -996,86 +996,6 @@ public static String getTextFromObject(final JsonObject textObject, final boolea
996996
return text;
997997
}
998998

999-
/**
1000-
* Parse a video description in the new "attributed" format, which contains the entire visible
1001-
* plaintext ({@code content}) and an array of {@code commandRuns}.
1002-
*
1003-
* <p>
1004-
* The {@code commandRuns} include the links and their position in the text.
1005-
* </p>
1006-
*
1007-
* @param attributedDescription the JSON object of the attributed description
1008-
* @return the parsed description, in HTML format, as a string
1009-
*/
1010-
@Nullable
1011-
public static String getAttributedDescription(
1012-
@Nullable final JsonObject attributedDescription) {
1013-
if (isNullOrEmpty(attributedDescription)) {
1014-
return null;
1015-
}
1016-
1017-
final String content = attributedDescription.getString("content");
1018-
if (content == null) {
1019-
return null;
1020-
}
1021-
1022-
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
1023-
1024-
final StringBuilder textBuilder = new StringBuilder();
1025-
int textStart = 0;
1026-
1027-
for (final Object commandRun : commandRuns) {
1028-
if (!(commandRun instanceof JsonObject)) {
1029-
continue;
1030-
}
1031-
1032-
final JsonObject run = ((JsonObject) commandRun);
1033-
final int startIndex = run.getInt("startIndex", -1);
1034-
final int length = run.getInt("length");
1035-
final JsonObject navigationEndpoint = run.getObject("onTap")
1036-
.getObject("innertubeCommand");
1037-
1038-
if (startIndex < 0 || length < 1 || navigationEndpoint == null) {
1039-
continue;
1040-
}
1041-
1042-
final String url = getUrlFromNavigationEndpoint(navigationEndpoint);
1043-
1044-
if (url == null) {
1045-
continue;
1046-
}
1047-
1048-
// Append text before the link
1049-
if (startIndex > textStart) {
1050-
textBuilder.append(content, textStart, startIndex);
1051-
}
1052-
1053-
// Trim and append link text
1054-
// Channel/Video format: 3xu00a0, (/ •), u00a0, <Name>, 2xu00a0
1055-
final String linkText = content.substring(startIndex, startIndex + length)
1056-
.replace('\u00a0', ' ')
1057-
.trim()
1058-
.replaceFirst("^[/•] *", "");
1059-
1060-
textBuilder.append("<a href=\"")
1061-
.append(Entities.escape(url))
1062-
.append("\">")
1063-
.append(Entities.escape(linkText))
1064-
.append("</a>");
1065-
1066-
textStart = startIndex + length;
1067-
}
1068-
1069-
// Append the remaining text
1070-
if (textStart < content.length()) {
1071-
textBuilder.append(content.substring(textStart));
1072-
}
1073-
1074-
return textBuilder.toString()
1075-
.replaceAll("\\n", "<br>")
1076-
.replaceAll(" {2}", " &nbsp;");
1077-
}
1078-
1079999
@Nonnull
10801000
public static String getTextFromObjectOrThrow(final JsonObject textObject, final String error)
10811001
throws ParsingException {

extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsEUVMInfoItemExtractor.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import java.util.List;
1616
import java.util.Objects;
1717

18-
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getAttributedDescription;
18+
import static org.schabi.newpipe.extractor.services.youtube.YoutubeDescriptionHelper.attributedDescriptionToHtml;
1919
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getImagesFromThumbnailsArray;
2020
import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
2121

@@ -97,7 +97,7 @@ public String getTextualLikeCount() {
9797
public Description getCommentText() throws ParsingException {
9898
// Comments' text work in the same way as an attributed video description
9999
return new Description(
100-
getAttributedDescription(commentEntityPayload.getObject(PROPERTIES)
100+
attributedDescriptionToHtml(commentEntityPayload.getObject(PROPERTIES)
101101
.getObject("content")), Description.HTML);
102102
}
103103

extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
import static org.schabi.newpipe.extractor.services.youtube.ItagItem.APPROX_DURATION_MS_UNKNOWN;
2424
import static org.schabi.newpipe.extractor.services.youtube.ItagItem.CONTENT_LENGTH_UNKNOWN;
25+
import static org.schabi.newpipe.extractor.services.youtube.YoutubeDescriptionHelper.attributedDescriptionToHtml;
2526
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.CONTENT_CHECK_OK;
2627
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.CPN;
2728
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.RACY_CHECK_OK;
@@ -30,7 +31,6 @@
3031
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.fixThumbnailUrl;
3132
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.generateContentPlaybackNonce;
3233
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.generateTParameter;
33-
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getAttributedDescription;
3434
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getImagesFromThumbnailsArray;
3535
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonAndroidPostResponse;
3636
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonIosPostResponse;
@@ -261,7 +261,7 @@ public Description getDescription() throws ParsingException {
261261
return new Description(videoSecondaryInfoRendererDescription, Description.HTML);
262262
}
263263

264-
final String attributedDescription = getAttributedDescription(
264+
final String attributedDescription = attributedDescriptionToHtml(
265265
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
266266
if (!isNullOrEmpty(attributedDescription)) {
267267
return new Description(attributedDescription, Description.HTML);

0 commit comments

Comments
 (0)