Skip to content

Commit e38d906

Browse files
committed
Fix timestamp links in YouTube video descriptions
For some reason, in NewPipeExtractor, comments were loaded from JSON by YoutubeCommentsInfoItemExtractor as text and sent via CommentsInfoItem#getCommentText to NewPipe, where timestamps are converted to hyperlinks using Linkify (see TeamNewPipe/NewPipe#2168). On the other hand, video descriptions are handled in NewPipeExtractor by scraping the watch-page HTML. There, timestamp links were previously mangled (and are now properly parsed) before being sent as HTML via YoutubeStreamExtractor#getDescription to NewPipe, where the HTML gets converted to Spanned. The logic introduced in this commit is different from the above PR, since it operates in the extractor and mutates the HTML DOM rather than identifying timestamps via regex.
1 parent 430da57 commit e38d906

1 file changed

Lines changed: 44 additions & 2 deletions

File tree

extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
import java.net.MalformedURLException;
3131
import java.net.URL;
3232
import java.util.*;
33+
import java.util.regex.Matcher;
34+
import java.util.regex.Pattern;
3335

3436
/*
3537
* Created by Christian Schabesberger on 06.08.15.
@@ -162,14 +164,54 @@ public String getDescription() throws ParsingException {
162164
}
163165
}
164166

167+
// Matches the seekTo(...) call found in the onclick attribute of timestamp links
// in a video description, and captures the hour/minute/second operands.
// Example attribute value:
//   onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;"
168+
// Observed behaviour: ":00" is NOT recognized as a timestamp in description or comments.
169+
// "0:00" is recognized in both description and comments.
170+
// Example video: https://www.youtube.com/watch?v=4cccfDXu1vA
171+
private final static Pattern DESCRIPTION_TIMESTAMP_ONCLICK_REGEX = Pattern.compile(
172+
"seekTo\\("
173+
+ "(?:(\\d+)\\*3600\\+)?" // group 1: hours; the whole "H*3600+" term is optional
174+
+ "(\\d+)\\*60\\+" // group 2: minutes
175+
+ "(\\d+)" // group 3: seconds
176+
+ "\\)");
177+
178+
/**
 * Returns the first argument that is not {@code null}, checking them in the order given.
 *
 * @param args candidate values, examined from first to last
 * @param <T>  type of the candidate values
 * @return the first element of {@code args} that is not {@code null}
 * @throws IllegalArgumentException if every argument is {@code null}, or no arguments
 *                                  were passed at all
 */
@SafeVarargs
179+
private static <T> T coalesce(T... args) {
180+
for (T arg : args) {
181+
if (arg != null) return arg;
182+
}
183+
// Reached only when the loop found no non-null candidate.
throw new IllegalArgumentException("all arguments to coalesce() were null");
184+
}
185+
165186
private String parseHtmlAndGetFullLinks(String descriptionHtml)
166187
throws MalformedURLException, UnsupportedEncodingException, ParsingException {
167188
final Document description = Jsoup.parse(descriptionHtml, getUrl());
168189
for(Element a : description.select("a")) {
169190
final String rawUrl = a.attr("abs:href");
170191
final URL redirectLink = new URL(rawUrl);
171-
final String queryString = redirectLink.getQuery();
172-
if(queryString != null) {
192+
193+
final Matcher onClickTimestamp;
194+
final String queryString;
195+
if ((onClickTimestamp = DESCRIPTION_TIMESTAMP_ONCLICK_REGEX.matcher(a.attr("onclick")))
196+
.find()) {
197+
a.removeAttr("onclick");
198+
199+
String hours = coalesce(onClickTimestamp.group(1), "0");
200+
String minutes = onClickTimestamp.group(2);
201+
String seconds = onClickTimestamp.group(3);
202+
203+
int timestamp = 0;
204+
timestamp += Integer.parseInt(hours) * 3600;
205+
timestamp += Integer.parseInt(minutes) * 60;
206+
timestamp += Integer.parseInt(seconds);
207+
208+
String setTimestamp = "&t=" + timestamp;
209+
210+
// Even after clicking https://youtu.be/...?t=6,
211+
// getUrl() is https://www.youtube.com/watch?v=..., never youtu.be, never &t=.
212+
a.attr("href", getUrl() + setTimestamp);
213+
214+
} else if((queryString = redirectLink.getQuery()) != null) {
173215
// if the query string is null we are not dealing with a redirect link,
174216
// so we don't need to override it.
175217
final String link =

0 commit comments

Comments
 (0)