Skip to content

Commit 1556adb

Browse files
committed
[YouTube] Fix hashtags links extraction and escape text in attribute descriptions + HTML links
The webCommandMetadata object is contained inside a commandMetadata object, so it is not accessible from the root of the navigationEndpoint object. The corresponding check has been moved to the end of the endpoint-specific parsing: because the webCommandMetadata object is present on almost every endpoint, checking it first would have changed the URLs of some endpoints, such as uploader URLs (from channel IDs to handles). As ParsingException is no longer thrown by getUrlFromNavigationEndpoint, and therefore no longer by getTextFromObject, getUrlFromObject and getTextAtKey either, the methods that were catching ParsingExceptions thrown by these methods had to be updated. URLs returned in the HTML version of getTextFromObject are now escaped properly to provide valid HTML to clients. This has also been done for attributed descriptions, together with the description text of this type of description. As YouTube descriptions are in HTML format (except for the fallback on the JSON player response, which is plain text and only happens when there is no visual metadata or after a breaking change), all returned URLs are escaped, so tests that check for the presence of URLs containing escaped characters had to be updated (this was only the case for YoutubeStreamExtractorDefaultTest.DescriptionTestUnboxing).
1 parent 99ab977 commit 1556adb

4 files changed

Lines changed: 64 additions & 80 deletions

File tree

extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java

Lines changed: 42 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -822,7 +822,7 @@ public static String[] getYoutubeMusicKey()
822822

823823
try {
824824
final String url = "https://music.youtube.com/sw.js";
825-
final var headers = getOriginReferrerHeaders("https://music.youtube.com");
825+
final var headers = getOriginReferrerHeaders(YOUTUBE_MUSIC_URL);
826826
final String response = getDownloader().get(url, headers).responseBody();
827827
musicClientVersion = getStringResultFromRegexArray(response,
828828
INNERTUBE_CONTEXT_CLIENT_VERSION_REGEXES, 1);
@@ -843,18 +843,11 @@ public static String[] getYoutubeMusicKey()
843843
}
844844

845845
@Nullable
846-
public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navigationEndpoint)
847-
throws ParsingException {
848-
if (navigationEndpoint.has("webCommandMetadata")) {
849-
// this case needs to be handled before the browseEndpoint,
850-
// e.g. for hashtags in comments
851-
final JsonObject metadata = navigationEndpoint.getObject("webCommandMetadata");
852-
if (metadata.has("url")) {
853-
return "https://www.youtube.com" + metadata.getString("url");
854-
}
855-
}
846+
public static String getUrlFromNavigationEndpoint(
847+
@Nonnull final JsonObject navigationEndpoint) {
856848
if (navigationEndpoint.has("urlEndpoint")) {
857-
String internUrl = navigationEndpoint.getObject("urlEndpoint").getString("url");
849+
String internUrl = navigationEndpoint.getObject("urlEndpoint")
850+
.getString("url");
858851
if (internUrl.startsWith("https://www.youtube.com/redirect?")) {
859852
// remove https://www.youtube.com part to fall in the next if block
860853
internUrl = internUrl.substring(23);
@@ -879,7 +872,9 @@ public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navi
879872
|| internUrl.startsWith("/watch")) {
880873
return "https://www.youtube.com" + internUrl;
881874
}
882-
} else if (navigationEndpoint.has("browseEndpoint")) {
875+
}
876+
877+
if (navigationEndpoint.has("browseEndpoint")) {
883878
final JsonObject browseEndpoint = navigationEndpoint.getObject("browseEndpoint");
884879
final String canonicalBaseUrl = browseEndpoint.getString("canonicalBaseUrl");
885880
final String browseId = browseEndpoint.getString("browseId");
@@ -892,26 +887,39 @@ public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navi
892887
if (!isNullOrEmpty(canonicalBaseUrl)) {
893888
return "https://www.youtube.com" + canonicalBaseUrl;
894889
}
890+
}
895891

896-
throw new ParsingException("canonicalBaseUrl is null and browseId is not a channel (\""
897-
+ browseEndpoint + "\")");
898-
} else if (navigationEndpoint.has("watchEndpoint")) {
892+
if (navigationEndpoint.has("watchEndpoint")) {
899893
final StringBuilder url = new StringBuilder();
900-
url.append("https://www.youtube.com/watch?v=").append(navigationEndpoint
901-
.getObject("watchEndpoint").getString(VIDEO_ID));
894+
url.append("https://www.youtube.com/watch?v=")
895+
.append(navigationEndpoint.getObject("watchEndpoint")
896+
.getString(VIDEO_ID));
902897
if (navigationEndpoint.getObject("watchEndpoint").has("playlistId")) {
903898
url.append("&list=").append(navigationEndpoint.getObject("watchEndpoint")
904899
.getString("playlistId"));
905900
}
906901
if (navigationEndpoint.getObject("watchEndpoint").has("startTimeSeconds")) {
907-
url.append("&t=").append(navigationEndpoint.getObject("watchEndpoint")
902+
url.append("&t=")
903+
.append(navigationEndpoint.getObject("watchEndpoint")
908904
.getInt("startTimeSeconds"));
909905
}
910906
return url.toString();
911-
} else if (navigationEndpoint.has("watchPlaylistEndpoint")) {
907+
}
908+
909+
if (navigationEndpoint.has("watchPlaylistEndpoint")) {
912910
return "https://www.youtube.com/playlist?list="
913-
+ navigationEndpoint.getObject("watchPlaylistEndpoint").getString("playlistId");
911+
+ navigationEndpoint.getObject("watchPlaylistEndpoint")
912+
.getString("playlistId");
914913
}
914+
915+
if (navigationEndpoint.has("commandMetadata")) {
916+
final JsonObject metadata = navigationEndpoint.getObject("commandMetadata")
917+
.getObject("webCommandMetadata");
918+
if (metadata.has("url")) {
919+
return "https://www.youtube.com" + metadata.getString("url");
920+
}
921+
}
922+
915923
return null;
916924
}
917925

@@ -924,8 +932,7 @@ public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navi
924932
* @return text in the JSON object or {@code null}
925933
*/
926934
@Nullable
927-
public static String getTextFromObject(final JsonObject textObject, final boolean html)
928-
throws ParsingException {
935+
public static String getTextFromObject(final JsonObject textObject, final boolean html) {
929936
if (isNullOrEmpty(textObject)) {
930937
return null;
931938
}
@@ -944,12 +951,12 @@ public static String getTextFromObject(final JsonObject textObject, final boolea
944951
String text = run.getString("text");
945952

946953
if (html) {
947-
text = Entities.escape(text);
948954
if (run.has("navigationEndpoint")) {
949-
final String url = getUrlFromNavigationEndpoint(run
950-
.getObject("navigationEndpoint"));
955+
final String url = getUrlFromNavigationEndpoint(
956+
run.getObject("navigationEndpoint"));
951957
if (!isNullOrEmpty(url)) {
952-
text = "<a href=\"" + url + "\">" + text + "</a>";
958+
text = "<a href=\"" + Entities.escape(url) + "\">" + Entities.escape(text)
959+
+ "</a>";
953960
}
954961
}
955962

@@ -1015,11 +1022,12 @@ public static String getAttributedDescription(
10151022
}
10161023

10171024
final String content = attributedDescription.getString("content");
1018-
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
10191025
if (content == null) {
10201026
return null;
10211027
}
10221028

1029+
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
1030+
10231031
final StringBuilder textBuilder = new StringBuilder();
10241032
int textStart = 0;
10251033

@@ -1038,12 +1046,7 @@ public static String getAttributedDescription(
10381046
continue;
10391047
}
10401048

1041-
final String url;
1042-
try {
1043-
url = getUrlFromNavigationEndpoint(navigationEndpoint);
1044-
} catch (final ParsingException e) {
1045-
continue;
1046-
}
1049+
final String url = getUrlFromNavigationEndpoint(navigationEndpoint);
10471050

10481051
if (url == null) {
10491052
continue;
@@ -1062,9 +1065,9 @@ public static String getAttributedDescription(
10621065
.replaceFirst("^[/•] *", "");
10631066

10641067
textBuilder.append("<a href=\"")
1065-
.append(url)
1068+
.append(Entities.escape(url))
10661069
.append("\">")
1067-
.append(linkText)
1070+
.append(Entities.escape(linkText))
10681071
.append("</a>");
10691072

10701073
textStart = startIndex + length;
@@ -1081,13 +1084,12 @@ public static String getAttributedDescription(
10811084
}
10821085

10831086
@Nullable
1084-
public static String getTextFromObject(final JsonObject textObject) throws ParsingException {
1087+
public static String getTextFromObject(final JsonObject textObject) {
10851088
return getTextFromObject(textObject, false);
10861089
}
10871090

10881091
@Nullable
1089-
public static String getUrlFromObject(final JsonObject textObject) throws ParsingException {
1090-
1092+
public static String getUrlFromObject(final JsonObject textObject) {
10911093
if (isNullOrEmpty(textObject)) {
10921094
return null;
10931095
}
@@ -1108,8 +1110,7 @@ public static String getUrlFromObject(final JsonObject textObject) throws Parsin
11081110
}
11091111

11101112
@Nullable
1111-
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey)
1112-
throws ParsingException {
1113+
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey) {
11131114
if (jsonObject.isString(theKey)) {
11141115
return jsonObject.getString(theKey);
11151116
} else {

extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelInfoItemExtractor.java

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,10 @@ public YoutubeChannelInfoItemExtractor(final JsonObject channelInfoItem) {
4545
this.channelInfoItem = channelInfoItem;
4646

4747
boolean wHandle = false;
48-
try {
49-
final String subscriberCountText = getTextFromObject(
50-
channelInfoItem.getObject("subscriberCountText"));
51-
if (subscriberCountText != null) {
52-
wHandle = subscriberCountText.startsWith("@");
53-
}
54-
} catch (final ParsingException ignored) {
48+
final String subscriberCountText = getTextFromObject(
49+
channelInfoItem.getObject("subscriberCountText"));
50+
if (subscriberCountText != null) {
51+
wHandle = subscriberCountText.startsWith("@");
5552
}
5653
this.withHandle = wHandle;
5754
}

extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java

Lines changed: 14 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -168,11 +168,7 @@ public String getName() throws ParsingException {
168168
title = playerResponse.getObject("videoDetails").getString("title");
169169

170170
if (isNullOrEmpty(title)) {
171-
try {
172-
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
173-
} catch (final ParsingException ignored) {
174-
// Age-restricted videos cause a ParsingException here
175-
}
171+
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
176172

177173
if (isNullOrEmpty(title)) {
178174
throw new ParsingException("Could not get name");
@@ -285,21 +281,17 @@ public String getThumbnailUrl() throws ParsingException {
285281
public Description getDescription() throws ParsingException {
286282
assertPageFetched();
287283
// Description with more info on links
288-
try {
289-
final String description = getTextFromObject(
290-
getVideoSecondaryInfoRenderer().getObject("description"),
291-
true);
292-
if (!isNullOrEmpty(description)) {
293-
return new Description(description, Description.HTML);
294-
}
284+
final String videoSecondaryInfoRendererDescription = getTextFromObject(
285+
getVideoSecondaryInfoRenderer().getObject("description"),
286+
true);
287+
if (!isNullOrEmpty(videoSecondaryInfoRendererDescription)) {
288+
return new Description(videoSecondaryInfoRendererDescription, Description.HTML);
289+
}
295290

296-
final String attributedDescription = getAttributedDescription(
297-
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
298-
if (!isNullOrEmpty(attributedDescription)) {
299-
return new Description(attributedDescription, Description.HTML);
300-
}
301-
} catch (final ParsingException ignored) {
302-
// Age-restricted videos cause a ParsingException here
291+
final String attributedDescription = getAttributedDescription(
292+
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
293+
if (!isNullOrEmpty(attributedDescription)) {
294+
return new Description(attributedDescription, Description.HTML);
303295
}
304296

305297
String description = playerResponse.getObject("videoDetails")
@@ -400,14 +392,8 @@ public long getTimeStamp() throws ParsingException {
400392

401393
@Override
402394
public long getViewCount() throws ParsingException {
403-
String views = null;
404-
405-
try {
406-
views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
407-
.getObject("videoViewCountRenderer").getObject("viewCount"));
408-
} catch (final ParsingException ignored) {
409-
// Age-restricted videos cause a ParsingException here
410-
}
395+
String views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
396+
.getObject("videoViewCountRenderer").getObject("viewCount"));
411397

412398
if (isNullOrEmpty(views)) {
413399
views = playerResponse.getObject("videoDetails").getString("viewCount");
@@ -795,7 +781,7 @@ public String getErrorMessage() {
795781
return getTextFromObject(playerResponse.getObject("playabilityStatus")
796782
.getObject("errorScreen").getObject("playerErrorMessageRenderer")
797783
.getObject("reason"));
798-
} catch (final ParsingException | NullPointerException e) {
784+
} catch (final NullPointerException e) {
799785
return null; // No error message
800786
}
801787
}

extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,10 @@ public static void setUp() throws Exception {
183183
@Override public String expectedUploaderUrl() { return "https://www.youtube.com/channel/UCsTcErHg8oDvUnTzoqsYeNw"; }
184184
@Override public long expectedUploaderSubscriberCountAtLeast() { return 18_000_000; }
185185
@Override public List<String> expectedDescriptionContains() {
186-
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
187-
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
188-
"https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
189-
"https://www.youtube.com/watch?v=U-9tUEOFKNU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
186+
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
187+
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
188+
"https://www.youtube.com/watch?v=XxaRBPyrnBU&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
189+
"https://www.youtube.com/watch?v=U-9tUEOFKNU&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
190190
}
191191
@Override public long expectedLength() { return 434; }
192192
@Override public long expectedViewCountAtLeast() { return 21229200; }

0 commit comments

Comments
 (0)