Skip to content

Commit 19e4b21

Browse files
authored
Merge pull request #1032 from AudricV/yt_fix-comments-hashtags-links-extraction
[YouTube] Fix hashtags links extraction and escape HTML links
2 parents b129849 + a63f289 commit 19e4b21

14 files changed

Lines changed: 314 additions & 761 deletions

File tree

extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java

Lines changed: 42 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -822,7 +822,7 @@ public static String[] getYoutubeMusicKey()
822822

823823
try {
824824
final String url = "https://music.youtube.com/sw.js";
825-
final var headers = getOriginReferrerHeaders("https://music.youtube.com");
825+
final var headers = getOriginReferrerHeaders(YOUTUBE_MUSIC_URL);
826826
final String response = getDownloader().get(url, headers).responseBody();
827827
musicClientVersion = getStringResultFromRegexArray(response,
828828
INNERTUBE_CONTEXT_CLIENT_VERSION_REGEXES, 1);
@@ -843,18 +843,11 @@ public static String[] getYoutubeMusicKey()
843843
}
844844

845845
@Nullable
846-
public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navigationEndpoint)
847-
throws ParsingException {
848-
if (navigationEndpoint.has("webCommandMetadata")) {
849-
// this case needs to be handled before the browseEndpoint,
850-
// e.g. for hashtags in comments
851-
final JsonObject metadata = navigationEndpoint.getObject("webCommandMetadata");
852-
if (metadata.has("url")) {
853-
return "https://www.youtube.com" + metadata.getString("url");
854-
}
855-
}
846+
public static String getUrlFromNavigationEndpoint(
847+
@Nonnull final JsonObject navigationEndpoint) {
856848
if (navigationEndpoint.has("urlEndpoint")) {
857-
String internUrl = navigationEndpoint.getObject("urlEndpoint").getString("url");
849+
String internUrl = navigationEndpoint.getObject("urlEndpoint")
850+
.getString("url");
858851
if (internUrl.startsWith("https://www.youtube.com/redirect?")) {
859852
// remove https://www.youtube.com part to fall in the next if block
860853
internUrl = internUrl.substring(23);
@@ -879,7 +872,9 @@ public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navi
879872
|| internUrl.startsWith("/watch")) {
880873
return "https://www.youtube.com" + internUrl;
881874
}
882-
} else if (navigationEndpoint.has("browseEndpoint")) {
875+
}
876+
877+
if (navigationEndpoint.has("browseEndpoint")) {
883878
final JsonObject browseEndpoint = navigationEndpoint.getObject("browseEndpoint");
884879
final String canonicalBaseUrl = browseEndpoint.getString("canonicalBaseUrl");
885880
final String browseId = browseEndpoint.getString("browseId");
@@ -892,26 +887,39 @@ public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navi
892887
if (!isNullOrEmpty(canonicalBaseUrl)) {
893888
return "https://www.youtube.com" + canonicalBaseUrl;
894889
}
890+
}
895891

896-
throw new ParsingException("canonicalBaseUrl is null and browseId is not a channel (\""
897-
+ browseEndpoint + "\")");
898-
} else if (navigationEndpoint.has("watchEndpoint")) {
892+
if (navigationEndpoint.has("watchEndpoint")) {
899893
final StringBuilder url = new StringBuilder();
900-
url.append("https://www.youtube.com/watch?v=").append(navigationEndpoint
901-
.getObject("watchEndpoint").getString(VIDEO_ID));
894+
url.append("https://www.youtube.com/watch?v=")
895+
.append(navigationEndpoint.getObject("watchEndpoint")
896+
.getString(VIDEO_ID));
902897
if (navigationEndpoint.getObject("watchEndpoint").has("playlistId")) {
903898
url.append("&list=").append(navigationEndpoint.getObject("watchEndpoint")
904899
.getString("playlistId"));
905900
}
906901
if (navigationEndpoint.getObject("watchEndpoint").has("startTimeSeconds")) {
907-
url.append("&t=").append(navigationEndpoint.getObject("watchEndpoint")
902+
url.append("&t=")
903+
.append(navigationEndpoint.getObject("watchEndpoint")
908904
.getInt("startTimeSeconds"));
909905
}
910906
return url.toString();
911-
} else if (navigationEndpoint.has("watchPlaylistEndpoint")) {
907+
}
908+
909+
if (navigationEndpoint.has("watchPlaylistEndpoint")) {
912910
return "https://www.youtube.com/playlist?list="
913-
+ navigationEndpoint.getObject("watchPlaylistEndpoint").getString("playlistId");
911+
+ navigationEndpoint.getObject("watchPlaylistEndpoint")
912+
.getString("playlistId");
914913
}
914+
915+
if (navigationEndpoint.has("commandMetadata")) {
916+
final JsonObject metadata = navigationEndpoint.getObject("commandMetadata")
917+
.getObject("webCommandMetadata");
918+
if (metadata.has("url")) {
919+
return "https://www.youtube.com" + metadata.getString("url");
920+
}
921+
}
922+
915923
return null;
916924
}
917925

@@ -924,8 +932,7 @@ public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navi
924932
* @return text in the JSON object or {@code null}
925933
*/
926934
@Nullable
927-
public static String getTextFromObject(final JsonObject textObject, final boolean html)
928-
throws ParsingException {
935+
public static String getTextFromObject(final JsonObject textObject, final boolean html) {
929936
if (isNullOrEmpty(textObject)) {
930937
return null;
931938
}
@@ -944,12 +951,12 @@ public static String getTextFromObject(final JsonObject textObject, final boolea
944951
String text = run.getString("text");
945952

946953
if (html) {
947-
text = Entities.escape(text);
948954
if (run.has("navigationEndpoint")) {
949-
final String url = getUrlFromNavigationEndpoint(run
950-
.getObject("navigationEndpoint"));
955+
final String url = getUrlFromNavigationEndpoint(
956+
run.getObject("navigationEndpoint"));
951957
if (!isNullOrEmpty(url)) {
952-
text = "<a href=\"" + url + "\">" + text + "</a>";
958+
text = "<a href=\"" + Entities.escape(url) + "\">" + Entities.escape(text)
959+
+ "</a>";
953960
}
954961
}
955962

@@ -1015,11 +1022,12 @@ public static String getAttributedDescription(
10151022
}
10161023

10171024
final String content = attributedDescription.getString("content");
1018-
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
10191025
if (content == null) {
10201026
return null;
10211027
}
10221028

1029+
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
1030+
10231031
final StringBuilder textBuilder = new StringBuilder();
10241032
int textStart = 0;
10251033

@@ -1038,12 +1046,7 @@ public static String getAttributedDescription(
10381046
continue;
10391047
}
10401048

1041-
final String url;
1042-
try {
1043-
url = getUrlFromNavigationEndpoint(navigationEndpoint);
1044-
} catch (final ParsingException e) {
1045-
continue;
1046-
}
1049+
final String url = getUrlFromNavigationEndpoint(navigationEndpoint);
10471050

10481051
if (url == null) {
10491052
continue;
@@ -1062,9 +1065,9 @@ public static String getAttributedDescription(
10621065
.replaceFirst("^[/•] *", "");
10631066

10641067
textBuilder.append("<a href=\"")
1065-
.append(url)
1068+
.append(Entities.escape(url))
10661069
.append("\">")
1067-
.append(linkText)
1070+
.append(Entities.escape(linkText))
10681071
.append("</a>");
10691072

10701073
textStart = startIndex + length;
@@ -1081,13 +1084,12 @@ public static String getAttributedDescription(
10811084
}
10821085

10831086
@Nullable
1084-
public static String getTextFromObject(final JsonObject textObject) throws ParsingException {
1087+
public static String getTextFromObject(final JsonObject textObject) {
10851088
return getTextFromObject(textObject, false);
10861089
}
10871090

10881091
@Nullable
1089-
public static String getUrlFromObject(final JsonObject textObject) throws ParsingException {
1090-
1092+
public static String getUrlFromObject(final JsonObject textObject) {
10911093
if (isNullOrEmpty(textObject)) {
10921094
return null;
10931095
}
@@ -1108,8 +1110,7 @@ public static String getUrlFromObject(final JsonObject textObject) throws Parsin
11081110
}
11091111

11101112
@Nullable
1111-
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey)
1112-
throws ParsingException {
1113+
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey) {
11131114
if (jsonObject.isString(theKey)) {
11141115
return jsonObject.getString(theKey);
11151116
} else {

extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelInfoItemExtractor.java

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,10 @@ public YoutubeChannelInfoItemExtractor(final JsonObject channelInfoItem) {
4545
this.channelInfoItem = channelInfoItem;
4646

4747
boolean wHandle = false;
48-
try {
49-
final String subscriberCountText = getTextFromObject(
50-
channelInfoItem.getObject("subscriberCountText"));
51-
if (subscriberCountText != null) {
52-
wHandle = subscriberCountText.startsWith("@");
53-
}
54-
} catch (final ParsingException ignored) {
48+
final String subscriberCountText = getTextFromObject(
49+
channelInfoItem.getObject("subscriberCountText"));
50+
if (subscriberCountText != null) {
51+
wHandle = subscriberCountText.startsWith("@");
5552
}
5653
this.withHandle = wHandle;
5754
}

extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java

Lines changed: 14 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -168,11 +168,7 @@ public String getName() throws ParsingException {
168168
title = playerResponse.getObject("videoDetails").getString("title");
169169

170170
if (isNullOrEmpty(title)) {
171-
try {
172-
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
173-
} catch (final ParsingException ignored) {
174-
// Age-restricted videos cause a ParsingException here
175-
}
171+
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
176172

177173
if (isNullOrEmpty(title)) {
178174
throw new ParsingException("Could not get name");
@@ -285,21 +281,17 @@ public String getThumbnailUrl() throws ParsingException {
285281
public Description getDescription() throws ParsingException {
286282
assertPageFetched();
287283
// Description with more info on links
288-
try {
289-
final String description = getTextFromObject(
290-
getVideoSecondaryInfoRenderer().getObject("description"),
291-
true);
292-
if (!isNullOrEmpty(description)) {
293-
return new Description(description, Description.HTML);
294-
}
284+
final String videoSecondaryInfoRendererDescription = getTextFromObject(
285+
getVideoSecondaryInfoRenderer().getObject("description"),
286+
true);
287+
if (!isNullOrEmpty(videoSecondaryInfoRendererDescription)) {
288+
return new Description(videoSecondaryInfoRendererDescription, Description.HTML);
289+
}
295290

296-
final String attributedDescription = getAttributedDescription(
297-
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
298-
if (!isNullOrEmpty(attributedDescription)) {
299-
return new Description(attributedDescription, Description.HTML);
300-
}
301-
} catch (final ParsingException ignored) {
302-
// Age-restricted videos cause a ParsingException here
291+
final String attributedDescription = getAttributedDescription(
292+
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
293+
if (!isNullOrEmpty(attributedDescription)) {
294+
return new Description(attributedDescription, Description.HTML);
303295
}
304296

305297
String description = playerResponse.getObject("videoDetails")
@@ -400,14 +392,8 @@ public long getTimeStamp() throws ParsingException {
400392

401393
@Override
402394
public long getViewCount() throws ParsingException {
403-
String views = null;
404-
405-
try {
406-
views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
407-
.getObject("videoViewCountRenderer").getObject("viewCount"));
408-
} catch (final ParsingException ignored) {
409-
// Age-restricted videos cause a ParsingException here
410-
}
395+
String views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
396+
.getObject("videoViewCountRenderer").getObject("viewCount"));
411397

412398
if (isNullOrEmpty(views)) {
413399
views = playerResponse.getObject("videoDetails").getString("viewCount");
@@ -795,7 +781,7 @@ public String getErrorMessage() {
795781
return getTextFromObject(playerResponse.getObject("playabilityStatus")
796782
.getObject("errorScreen").getObject("playerErrorMessageRenderer")
797783
.getObject("reason"));
798-
} catch (final ParsingException | NullPointerException e) {
784+
} catch (final NullPointerException e) {
799785
return null; // No error message
800786
}
801787
}

extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,10 @@ public static void setUp() throws Exception {
183183
@Override public String expectedUploaderUrl() { return "https://www.youtube.com/channel/UCsTcErHg8oDvUnTzoqsYeNw"; }
184184
@Override public long expectedUploaderSubscriberCountAtLeast() { return 18_000_000; }
185185
@Override public List<String> expectedDescriptionContains() {
186-
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
187-
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
188-
"https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
189-
"https://www.youtube.com/watch?v=U-9tUEOFKNU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
186+
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
187+
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
188+
"https://www.youtube.com/watch?v=XxaRBPyrnBU&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
189+
"https://www.youtube.com/watch?v=U-9tUEOFKNU&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
190190
}
191191
@Override public long expectedLength() { return 434; }
192192
@Override public long expectedViewCountAtLeast() { return 21229200; }

extractor/src/test/resources/org/schabi/newpipe/extractor/services/youtube/extractor/comments/formatting/generated_mock_0.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
"httpMethod": "GET",
44
"url": "https://www.youtube.com/sw.js",
55
"headers": {
6-
"Origin": [
6+
"Referer": [
77
"https://www.youtube.com"
88
],
9-
"Referer": [
9+
"Origin": [
1010
"https://www.youtube.com"
1111
],
1212
"Accept-Language": [
@@ -29,7 +29,7 @@
2929
"https://www.youtube.com"
3030
],
3131
"alt-svc": [
32-
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000,h3-Q050\u003d\":443\"; ma\u003d2592000,h3-Q046\u003d\":443\"; ma\u003d2592000,h3-Q043\u003d\":443\"; ma\u003d2592000,quic\u003d\":443\"; ma\u003d2592000; v\u003d\"46,43\""
32+
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000"
3333
],
3434
"cache-control": [
3535
"private, max-age\u003d0"
@@ -41,10 +41,10 @@
4141
"same-origin; report-to\u003d\"youtube_main\""
4242
],
4343
"date": [
44-
"Mon, 28 Nov 2022 20:27:36 GMT"
44+
"Sun, 26 Feb 2023 17:48:54 GMT"
4545
],
4646
"expires": [
47-
"Mon, 28 Nov 2022 20:27:36 GMT"
47+
"Sun, 26 Feb 2023 17:48:54 GMT"
4848
],
4949
"p3p": [
5050
"CP\u003d\"This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl\u003den-GB for more info.\""
@@ -59,9 +59,9 @@
5959
"ESF"
6060
],
6161
"set-cookie": [
62-
"YSC\u003ddaTQ98V-voQ; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
63-
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dTue, 03-Mar-2020 20:27:36 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
64-
"CONSENT\u003dPENDING+452; expires\u003dWed, 27-Nov-2024 20:27:36 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
62+
"YSC\u003dYJXWRWCYVkE; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
63+
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dMon, 01-Jun-2020 17:48:54 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
64+
"CONSENT\u003dPENDING+668; expires\u003dTue, 25-Feb-2025 17:48:54 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
6565
],
6666
"strict-transport-security": [
6767
"max-age\u003d31536000"

extractor/src/test/resources/org/schabi/newpipe/extractor/services/youtube/extractor/comments/formatting/generated_mock_1.json

Lines changed: 6 additions & 6 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)