From bc6de149527fabb235068bfed538194316054ddb Mon Sep 17 00:00:00 2001 From: TobiGr Date: Sun, 20 Dec 2020 19:54:12 +0100 Subject: [PATCH] Extract stream and search meta info for YouTube Add method to extract Google webcache URLs. --- .../schabi/newpipe/extractor/MetaInfo.java | 76 ++++++++++ .../extractor/search/SearchExtractor.java | 13 ++ .../newpipe/extractor/search/SearchInfo.java | 24 ++- .../extractors/MediaCCCSearchExtractor.java | 8 + .../extractors/MediaCCCStreamExtractor.java | 7 + .../extractors/PeertubeSearchExtractor.java | 9 ++ .../extractors/PeertubeStreamExtractor.java | 7 + .../extractors/SoundcloudSearchExtractor.java | 9 ++ .../extractors/SoundcloudStreamExtractor.java | 7 + .../youtube/YoutubeParsingHelper.java | 141 ++++++++++++++++-- .../YoutubeMusicSearchExtractor.java | 7 + .../extractors/YoutubeSearchExtractor.java | 14 +- .../extractors/YoutubeStreamExtractor.java | 12 ++ .../extractor/stream/StreamExtractor.java | 15 ++ .../newpipe/extractor/stream/StreamInfo.java | 23 ++- .../services/DefaultSearchExtractorTest.java | 42 ++++++ .../services/DefaultStreamExtractorTest.java | 36 +++++ .../youtube/YoutubeParsingHelperTest.java | 8 + .../search/YoutubeSearchExtractorTest.java | 49 +++++- .../YoutubeStreamExtractorDefaultTest.java | 48 ++++++ 20 files changed, 526 insertions(+), 29 deletions(-) create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/MetaInfo.java diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/MetaInfo.java b/extractor/src/main/java/org/schabi/newpipe/extractor/MetaInfo.java new file mode 100644 index 000000000..da9928e87 --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/MetaInfo.java @@ -0,0 +1,76 @@ +package org.schabi.newpipe.extractor; + +import org.schabi.newpipe.extractor.stream.Description; + +import java.io.Serializable; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import javax.annotation.Nonnull; + +public class MetaInfo 
implements Serializable { + + private String title = ""; + private Description content; + private List urls = new ArrayList<>(); + private List urlTexts = new ArrayList<>(); + + public MetaInfo(@Nonnull final String title, @Nonnull final Description content, + @Nonnull final List urls, @Nonnull final List urlTexts) { + this.title = title; + this.content = content; + this.urls = urls; + this.urlTexts = urlTexts; + } + + public MetaInfo() { + } + + /** + * @return Title of the info. Can be empty. + */ + @Nonnull + public String getTitle() { + return title; + } + + public void setTitle(@Nonnull final String title) { + this.title = title; + } + + @Nonnull + public Description getContent() { + return content; + } + + public void setContent(@Nonnull final Description content) { + this.content = content; + } + + @Nonnull + public List getUrls() { + return urls; + } + + public void setUrls(@Nonnull final List urls) { + this.urls = urls; + } + + public void addUrl(@Nonnull final URL url) { + urls.add(url); + } + + @Nonnull + public List getUrlTexts() { + return urlTexts; + } + + public void setUrlTexts(@Nonnull final List urlTexts) { + this.urlTexts = urlTexts; + } + + public void addUrlText(@Nonnull final String urlText) { + urlTexts.add(urlText); + } +} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/search/SearchExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/search/SearchExtractor.java index d1c481bb4..3539dc816 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/search/SearchExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/search/SearchExtractor.java @@ -2,12 +2,14 @@ package org.schabi.newpipe.extractor.search; import org.schabi.newpipe.extractor.InfoItem; import org.schabi.newpipe.extractor.ListExtractor; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.exceptions.ExtractionException; import 
org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler; import javax.annotation.Nonnull; +import java.util.List; public abstract class SearchExtractor extends ListExtractor { @@ -57,4 +59,15 @@ public abstract class SearchExtractor extends ListExtractor { * @return whether the results comes from a corrected query or not. */ public abstract boolean isCorrectedSearch() throws ParsingException; + + /** + * Meta information about the search query. + * <p>
+ * Example: on YouTube, if you search for "Covid-19", + * there is a box with information from the WHO about Covid-19 and a link to the WHO's website. + * @return additional meta information about the search query + * @throws ParsingException + */ + @Nonnull + public abstract List getMetaInfo() throws ParsingException; } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/search/SearchInfo.java b/extractor/src/main/java/org/schabi/newpipe/extractor/search/SearchInfo.java index 8967b0a89..d1eb0b385 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/search/SearchInfo.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/search/SearchInfo.java @@ -1,20 +1,20 @@ package org.schabi.newpipe.extractor.search; -import org.schabi.newpipe.extractor.InfoItem; -import org.schabi.newpipe.extractor.ListExtractor; -import org.schabi.newpipe.extractor.ListInfo; -import org.schabi.newpipe.extractor.Page; -import org.schabi.newpipe.extractor.StreamingService; +import org.schabi.newpipe.extractor.*; import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler; import org.schabi.newpipe.extractor.utils.ExtractorHelper; import java.io.IOException; +import java.util.List; + +import javax.annotation.Nonnull; public class SearchInfo extends ListInfo { private String searchString; private String searchSuggestion; private boolean isCorrectedSearch; + private List metaInfo; public SearchInfo(int serviceId, SearchQueryHandler qIHandler, @@ -51,6 +51,11 @@ public class SearchInfo extends ListInfo { } catch (Exception e) { info.addError(e); } + try { + info.setMetaInfo(extractor.getMetaInfo()); + } catch (Exception e) { + info.addError(e); + } ListExtractor.InfoItemsPage page = ExtractorHelper.getItemsPageOrLogError(info, extractor); info.setRelatedItems(page.getItems()); @@ -87,4 +92,13 @@ public class SearchInfo extends ListInfo { public void setSearchSuggestion(String 
searchSuggestion) { this.searchSuggestion = searchSuggestion; } + + @Nonnull + public List getMetaInfo() { + return metaInfo; + } + + public void setMetaInfo(@Nonnull List metaInfo) { + this.metaInfo = metaInfo; + } } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCSearchExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCSearchExtractor.java index 676a89e8c..f2b30ed10 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCSearchExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCSearchExtractor.java @@ -6,6 +6,7 @@ import com.grack.nanojson.JsonParser; import com.grack.nanojson.JsonParserException; import org.schabi.newpipe.extractor.InfoItem; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.Page; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.channel.ChannelInfoItem; @@ -20,6 +21,7 @@ import org.schabi.newpipe.extractor.services.media_ccc.extractors.infoItems.Medi import org.schabi.newpipe.extractor.services.media_ccc.linkHandler.MediaCCCConferencesListLinkHandlerFactory; import java.io.IOException; +import java.util.Collections; import java.util.List; import javax.annotation.Nonnull; @@ -55,6 +57,12 @@ public class MediaCCCSearchExtractor extends SearchExtractor { return false; } + @Nonnull + @Override + public List getMetaInfo() { + return Collections.emptyList(); + } + @Nonnull @Override public InfoItemsPage getInitialPage() { diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCStreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCStreamExtractor.java index 97236ac5c..092923809 100644 --- 
a/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCStreamExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCStreamExtractor.java @@ -6,6 +6,7 @@ import com.grack.nanojson.JsonParser; import com.grack.nanojson.JsonParserException; import org.schabi.newpipe.extractor.MediaFormat; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.downloader.Downloader; import org.schabi.newpipe.extractor.exceptions.ExtractionException; @@ -301,4 +302,10 @@ public class MediaCCCStreamExtractor extends StreamExtractor { public List getStreamSegments() { return Collections.emptyList(); } + + @Nonnull + @Override + public List getMetaInfo() { + return Collections.emptyList(); + } } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/peertube/extractors/PeertubeSearchExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/peertube/extractors/PeertubeSearchExtractor.java index 72fcbed2e..0540c1979 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/peertube/extractors/PeertubeSearchExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/peertube/extractors/PeertubeSearchExtractor.java @@ -4,6 +4,7 @@ import com.grack.nanojson.JsonObject; import com.grack.nanojson.JsonParser; import org.schabi.newpipe.extractor.InfoItem; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.Page; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.downloader.Downloader; @@ -17,6 +18,8 @@ import org.schabi.newpipe.extractor.services.peertube.PeertubeParsingHelper; import org.schabi.newpipe.extractor.utils.Utils; import java.io.IOException; +import java.util.Collections; +import java.util.List; import javax.annotation.Nonnull; @@ -42,6 +45,12 @@ public class 
PeertubeSearchExtractor extends SearchExtractor { return false; } + @Nonnull + @Override + public List getMetaInfo() { + return Collections.emptyList(); + } + @Override public InfoItemsPage getInitialPage() throws IOException, ExtractionException { final String pageUrl = getUrl() + "&" + START_KEY + "=0&" + COUNT_KEY + "=" + ITEMS_PER_PAGE; diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/peertube/extractors/PeertubeStreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/peertube/extractors/PeertubeStreamExtractor.java index 6345e5acb..61ca7fb79 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/peertube/extractors/PeertubeStreamExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/peertube/extractors/PeertubeStreamExtractor.java @@ -5,6 +5,7 @@ import com.grack.nanojson.JsonObject; import com.grack.nanojson.JsonParser; import com.grack.nanojson.JsonParserException; import org.schabi.newpipe.extractor.MediaFormat; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.NewPipe; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.downloader.Downloader; @@ -309,6 +310,12 @@ public class PeertubeStreamExtractor extends StreamExtractor { return Collections.emptyList(); } + @Nonnull + @Override + public List getMetaInfo() { + return Collections.emptyList(); + } + private String getRelatedStreamsUrl(final List tags) throws UnsupportedEncodingException { final String url = baseUrl + PeertubeSearchQueryHandlerFactory.SEARCH_ENDPOINT; final StringBuilder params = new StringBuilder(); diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudSearchExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudSearchExtractor.java index 93a5e131a..b644778fe 100644 --- 
a/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudSearchExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudSearchExtractor.java @@ -8,6 +8,7 @@ import com.grack.nanojson.JsonParserException; import org.schabi.newpipe.extractor.InfoItem; import org.schabi.newpipe.extractor.InfoItemExtractor; import org.schabi.newpipe.extractor.InfoItemsCollector; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.Page; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.downloader.Downloader; @@ -22,6 +23,8 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; +import java.util.Collections; +import java.util.List; import javax.annotation.Nonnull; @@ -47,6 +50,12 @@ public class SoundcloudSearchExtractor extends SearchExtractor { return false; } + @Nonnull + @Override + public List getMetaInfo() { + return Collections.emptyList(); + } + @Nonnull @Override public InfoItemsPage getInitialPage() throws IOException, ExtractionException { diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudStreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudStreamExtractor.java index f24674de6..62e79cb2d 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudStreamExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudStreamExtractor.java @@ -6,6 +6,7 @@ import com.grack.nanojson.JsonParser; import com.grack.nanojson.JsonParserException; import org.schabi.newpipe.extractor.MediaFormat; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.NewPipe; import 
org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.downloader.Downloader; @@ -327,4 +328,10 @@ public class SoundcloudStreamExtractor extends StreamExtractor { public List getStreamSegments() { return Collections.emptyList(); } + + @Nonnull + @Override + public List getMetaInfo() { + return Collections.emptyList(); + } } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java index b36fe039a..39aba79af 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java @@ -76,19 +76,16 @@ public class YoutubeParsingHelper { private static final String FEED_BASE_CHANNEL_ID = "https://www.youtube.com/feeds/videos.xml?channel_id="; private static final String FEED_BASE_USER = "https://www.youtube.com/feeds/videos.xml?user="; - private static final String[] RECAPTCHA_DETECTION_SELECTORS = { - "form[action*=\"/das_captcha\"]", - "input[name*=\"action_recaptcha_verify\"]" - }; - - public static Document parseAndCheckPage(final String url, final Response response) throws ReCaptchaException { - final Document document = Jsoup.parse(response.responseBody(), url); - - for (String detectionSelector : RECAPTCHA_DETECTION_SELECTORS) { - if (!document.select(detectionSelector).isEmpty()) { - throw new ReCaptchaException("reCAPTCHA challenge requested (detected with selector: \"" + detectionSelector + "\")", url); - } + private static boolean isGoogleURL(String url) { + url = extractCachedUrlIfNeeded(url); + try { + final URL u = new URL(url); + final String host = u.getHost(); + return host.startsWith("google.") || host.startsWith("m.google."); + } catch (MalformedURLException e) { + return false; } + } return document; } @@ -650,4 +647,124 @@ public class 
YoutubeParsingHelper { } } } + + @Nonnull + public static List getMetaInfo(final JsonArray contents) throws ParsingException { + final List metaInfo = new ArrayList<>(); + for (final Object content : contents) { + final JsonObject resultObject = (JsonObject) content; + if (resultObject.has("itemSectionRenderer")) { + for (final Object sectionContentObject : + resultObject.getObject("itemSectionRenderer").getArray("contents")) { + + final JsonObject sectionContent = (JsonObject) sectionContentObject; + if (sectionContent.has("infoPanelContentRenderer")) { + metaInfo.add(getInfoPanelContent(sectionContent.getObject("infoPanelContentRenderer"))); + } + if (sectionContent.has("clarificationRenderer")) { + metaInfo.add(getClarificationRendererContent(sectionContent.getObject("clarificationRenderer") + )); + } + + } + } + } + return metaInfo; + } + + @Nonnull + private static MetaInfo getInfoPanelContent(final JsonObject infoPanelContentRenderer) + throws ParsingException { + final MetaInfo metaInfo = new MetaInfo(); + final StringBuilder sb = new StringBuilder(); + for (final Object paragraph : infoPanelContentRenderer.getArray("paragraphs")) { + if (sb.length() != 0) { + sb.append("<br>"); + } + sb.append(YoutubeParsingHelper.getTextFromObject((JsonObject) paragraph)); + } + metaInfo.setContent(new Description(sb.toString(), Description.HTML)); + if (infoPanelContentRenderer.has("sourceEndpoint")) { + final String metaInfoLinkUrl = YoutubeParsingHelper.getUrlFromNavigationEndpoint( + infoPanelContentRenderer.getObject("sourceEndpoint")); + try { + metaInfo.addUrl(new URL(Objects.requireNonNull(extractCachedUrlIfNeeded(metaInfoLinkUrl)))); + } catch (final NullPointerException | MalformedURLException e) { + throw new ParsingException("Could not get metadata info URL", e); + } + + final String metaInfoLinkText = YoutubeParsingHelper.getTextFromObject( + infoPanelContentRenderer.getObject("inlineSource")); + if (isNullOrEmpty(metaInfoLinkText)) { + throw new ParsingException("Could not get metadata info link text."); + } + metaInfo.addUrlText(metaInfoLinkText); + } + + return metaInfo; + } + + @Nonnull + private static MetaInfo getClarificationRendererContent(final JsonObject clarificationRenderer) + throws ParsingException { + final MetaInfo metaInfo = new MetaInfo(); + + final String title = YoutubeParsingHelper.getTextFromObject(clarificationRenderer.getObject("contentTitle")); + final String text = YoutubeParsingHelper.getTextFromObject(clarificationRenderer.getObject("text")); + if (title == null || text == null) { + throw new ParsingException("Could not extract clarification renderer content"); + } + metaInfo.setTitle(title); + metaInfo.setContent(new Description(text, Description.PLAIN_TEXT)); + + if (clarificationRenderer.has("actionButton")) { + final JsonObject actionButton = clarificationRenderer.getObject("actionButton") + .getObject("buttonRenderer"); + try { + final String url = YoutubeParsingHelper.getUrlFromNavigationEndpoint(actionButton.getObject("command")); + metaInfo.addUrl(new URL(Objects.requireNonNull(extractCachedUrlIfNeeded(url)))); + } catch (final NullPointerException | MalformedURLException e) { + throw new
ParsingException("Could not get metadata info URL", e); + } + + final String metaInfoLinkText = YoutubeParsingHelper.getTextFromObject( + actionButton.getObject("text")); + if (isNullOrEmpty(metaInfoLinkText)) { + throw new ParsingException("Could not get metadata info link text."); + } + metaInfo.addUrlText(metaInfoLinkText); + } + + if (clarificationRenderer.has("secondaryEndpoint") && clarificationRenderer.has("secondarySource")) { + final String url = getUrlFromNavigationEndpoint(clarificationRenderer.getObject("secondaryEndpoint")); + // ignore Google URLs, because those point to a Google search about "Covid-19" + if (url != null && !isGoogleURL(url)) { + try { + metaInfo.addUrl(new URL(url)); + final String description = getTextFromObject(clarificationRenderer.getObject("secondarySource")); + metaInfo.addUrlText(description == null ? url : description); + } catch (MalformedURLException e) { + throw new ParsingException("Could not get metadata info secondary URL", e); + } + } + } + + return metaInfo; + } + + /** + * Sometimes, YouTube provides URLs which use Google's cache. 
They look like + * {@code https://webcache.googleusercontent.com/search?q=cache:CACHED_URL} + * @param url the URL which might refer to the Google's webcache + * @return the URL which is referring to the original site + */ + public static String extractCachedUrlIfNeeded(final String url) { + if (url == null) { + return null; + } + if (url.contains("webcache.googleusercontent.com")) { + return url.split("cache:")[1]; + } + return url; + } } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeMusicSearchExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeMusicSearchExtractor.java index 249a2dae1..c7a4af8ce 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeMusicSearchExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeMusicSearchExtractor.java @@ -7,6 +7,7 @@ import com.grack.nanojson.JsonParserException; import com.grack.nanojson.JsonWriter; import org.schabi.newpipe.extractor.InfoItem; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.Page; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.downloader.Downloader; @@ -163,6 +164,12 @@ public class YoutubeMusicSearchExtractor extends SearchExtractor { return !showingResultsForRenderer.isEmpty(); } + @Nonnull + @Override + public List getMetaInfo() { + return Collections.emptyList(); + } + @Nonnull @Override public InfoItemsPage getInitialPage() throws ExtractionException, IOException { diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java index 6e7d41c48..783dbf1e7 100644 --- 
a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java @@ -7,6 +7,7 @@ import com.grack.nanojson.JsonParserException; import com.grack.nanojson.JsonWriter; import org.schabi.newpipe.extractor.InfoItem; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.Page; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.downloader.Downloader; @@ -16,13 +17,11 @@ import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler; import org.schabi.newpipe.extractor.localization.TimeAgoParser; import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector; import org.schabi.newpipe.extractor.search.SearchExtractor; +import org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper; import org.schabi.newpipe.extractor.utils.JsonUtils; import java.io.IOException; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import javax.annotation.Nonnull; @@ -106,6 +105,13 @@ public class YoutubeSearchExtractor extends SearchExtractor { return !showingResultsForRenderer.isEmpty(); } + @Override + public List getMetaInfo() throws ParsingException { + return YoutubeParsingHelper.getMetaInfo( + initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer") + .getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")); + } + @Nonnull @Override public InfoItemsPage getInitialPage() throws IOException, ExtractionException { diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java index cca69dd21..25eb3e89e 100644 --- 
a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java @@ -13,6 +13,7 @@ import org.mozilla.javascript.Context; import org.mozilla.javascript.Function; import org.mozilla.javascript.ScriptableObject; import org.schabi.newpipe.extractor.MediaFormat; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.NewPipe; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.downloader.Downloader; @@ -45,6 +46,9 @@ import org.schabi.newpipe.extractor.utils.Utils; import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; import java.time.LocalDate; import java.time.OffsetDateTime; import java.time.format.DateTimeFormatter; @@ -1118,4 +1122,12 @@ public class YoutubeStreamExtractor extends StreamExtractor { } return segments; } + + @Nonnull + @Override + public List getMetaInfo() throws ParsingException { + return YoutubeParsingHelper.getMetaInfo( + initialData.getObject("contents").getObject("twoColumnWatchNextResults") + .getObject("results").getObject("results").getArray("contents")); + } } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/stream/StreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/stream/StreamExtractor.java index dca4bbbc3..faf6b0efc 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/stream/StreamExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/stream/StreamExtractor.java @@ -22,6 +22,7 @@ package org.schabi.newpipe.extractor.stream; import org.schabi.newpipe.extractor.Extractor; import org.schabi.newpipe.extractor.MediaFormat; +import org.schabi.newpipe.extractor.MetaInfo; import 
org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.channel.ChannelExtractor; import org.schabi.newpipe.extractor.exceptions.ExtractionException; @@ -486,4 +487,18 @@ public abstract class StreamExtractor extends Extractor { */ @Nonnull public abstract List getStreamSegments() throws ParsingException; + + /** + * Meta information about the stream. + * <p>
+ * This can be information about the stream creator (e.g. if the creator is a public broadcaster) + * or further information on the topic (e.g. hints that the video might contain conspiracy theories + * or contains information about a current health situation like the Covid-19 pandemic). + * <p>
+ * The meta information often contains links to external sources like Wikipedia or the WHO. + * @return The meta info of the stream or an empty List if not provided. + * @throws ParsingException + */ + @Nonnull + public abstract List getMetaInfo() throws ParsingException; } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/stream/StreamInfo.java b/extractor/src/main/java/org/schabi/newpipe/extractor/stream/StreamInfo.java index 18eab21a7..8fa325544 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/stream/StreamInfo.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/stream/StreamInfo.java @@ -1,9 +1,6 @@ package org.schabi.newpipe.extractor.stream; -import org.schabi.newpipe.extractor.Info; -import org.schabi.newpipe.extractor.InfoItem; -import org.schabi.newpipe.extractor.NewPipe; -import org.schabi.newpipe.extractor.StreamingService; +import org.schabi.newpipe.extractor.*; import org.schabi.newpipe.extractor.exceptions.ContentNotAvailableException; import org.schabi.newpipe.extractor.exceptions.ContentNotSupportedException; import org.schabi.newpipe.extractor.exceptions.ExtractionException; @@ -13,9 +10,12 @@ import org.schabi.newpipe.extractor.utils.ExtractorHelper; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Locale; +import javax.annotation.Nonnull; + import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty; /* @@ -329,6 +329,11 @@ public class StreamInfo extends Info { } catch (Exception e) { streamInfo.addError(e); } + try { + streamInfo.setMetaInfo(extractor.getMetaInfo()); + } catch (Exception e) { + streamInfo.addError(e); + } streamInfo.setRelatedStreams(ExtractorHelper.getRelatedVideosOrLogError(streamInfo, extractor)); @@ -379,6 +384,7 @@ public class StreamInfo extends Info { private Locale language = null; private List tags = new ArrayList<>(); private List streamSegments = new ArrayList<>(); + private List 
metaInfo = new ArrayList<>(); /** * Get the stream type @@ -684,4 +690,13 @@ public class StreamInfo extends Info { public void setStreamSegments(List streamSegments) { this.streamSegments = streamSegments; } + + public void setMetaInfo(final List metaInfo) { + this.metaInfo = metaInfo; + } + + @Nonnull + public List getMetaInfo() { + return this.metaInfo; + } } diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/DefaultSearchExtractorTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/DefaultSearchExtractorTest.java index 8a8a4e53d..61ee11d0a 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/DefaultSearchExtractorTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/DefaultSearchExtractorTest.java @@ -1,12 +1,20 @@ package org.schabi.newpipe.extractor.services; import org.junit.Test; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.search.SearchExtractor; import javax.annotation.Nullable; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import static org.schabi.newpipe.extractor.ExtractorAsserts.assertEmpty; import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty; @@ -20,6 +28,10 @@ public abstract class DefaultSearchExtractorTest extends DefaultListExtractorTes return false; } + public List expectedMetaInfo() throws MalformedURLException { + return Collections.emptyList(); + } + @Test @Override public void testSearchString() throws Exception { @@ -41,4 +53,34 @@ public abstract class DefaultSearchExtractorTest extends DefaultListExtractorTes public void testSearchCorrected() throws Exception { assertEquals(isCorrectedSearch(), extractor().isCorrectedSearch()); } + + /** + * @see DefaultStreamExtractorTest#testMetaInfo() + */ + @Test 
+ public void testMetaInfo() throws Exception { + final List metaInfoList = extractor().getMetaInfo(); + final List expectedMetaInfoList = expectedMetaInfo(); + + for (final MetaInfo expectedMetaInfo : expectedMetaInfoList) { + final List texts = metaInfoList.stream() + .map(metaInfo -> metaInfo.getContent().getContent()) + .collect(Collectors.toList()); + final List titles = metaInfoList.stream().map(MetaInfo::getTitle).collect(Collectors.toList()); + final List urls = metaInfoList.stream().flatMap(info -> info.getUrls().stream()) + .collect(Collectors.toList()); + final List urlTexts = metaInfoList.stream().flatMap(info -> info.getUrlTexts().stream()) + .collect(Collectors.toList()); + + assertTrue(texts.contains(expectedMetaInfo.getContent().getContent())); + assertTrue(titles.contains(expectedMetaInfo.getTitle())); + + for (final String expectedUrlText : expectedMetaInfo.getUrlTexts()) { + assertTrue(urlTexts.contains(expectedUrlText)); + } + for (final URL expectedUrl : expectedMetaInfo.getUrls()) { + assertTrue(urls.contains(expectedUrl)); + } + } + } } diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/DefaultStreamExtractorTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/DefaultStreamExtractorTest.java index b3a1a0a81..b4c2be5e3 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/DefaultStreamExtractorTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/DefaultStreamExtractorTest.java @@ -2,6 +2,7 @@ package org.schabi.newpipe.extractor.services; import org.junit.Test; import org.schabi.newpipe.extractor.MediaFormat; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.localization.DateWrapper; import org.schabi.newpipe.extractor.stream.AudioStream; import org.schabi.newpipe.extractor.stream.Description; @@ -15,9 +16,12 @@ import org.schabi.newpipe.extractor.stream.VideoStream; import javax.annotation.Nullable; import 
java.time.LocalDateTime; import java.time.format.DateTimeFormatter; +import java.net.MalformedURLException; +import java.net.URL; import java.util.Collections; import java.util.List; import java.util.Locale; +import java.util.stream.Collectors; import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.MatcherAssert.assertThat; @@ -67,6 +71,7 @@ public abstract class DefaultStreamExtractorTest extends DefaultExtractorTest expectedTags() { return Collections.emptyList(); } // default: no tags public String expectedSupportInfo() { return ""; } // default: no support info available public int expectedStreamSegmentsCount() { return -1; } // return 0 or greater to test (default is -1 to ignore) + public List expectedMetaInfo() throws MalformedURLException { return Collections.emptyList(); } // default: no metadata info available @Test @Override @@ -387,4 +392,35 @@ public abstract class DefaultStreamExtractorTest extends DefaultExtractorTest metaInfoList = extractor().getMetaInfo(); + final List expectedMetaInfoList = expectedMetaInfo(); + + for (final MetaInfo expectedMetaInfo : expectedMetaInfoList) { + final List texts = metaInfoList.stream() + .map((metaInfo) -> metaInfo.getContent().getContent()) + .collect(Collectors.toList()); + final List titles = metaInfoList.stream().map(MetaInfo::getTitle).collect(Collectors.toList()); + final List urls = metaInfoList.stream().flatMap(info -> info.getUrls().stream()) + .collect(Collectors.toList()); + final List urlTexts = metaInfoList.stream().flatMap(info -> info.getUrlTexts().stream()) + .collect(Collectors.toList()); + + assertTrue(texts.contains(expectedMetaInfo.getContent().getContent())); + assertTrue(titles.contains(expectedMetaInfo.getTitle())); + + for (final String expectedUrlText : expectedMetaInfo.getUrlTexts()) { + assertTrue(urlTexts.contains(expectedUrlText)); + } + for (final URL expectedUrl : expectedMetaInfo.getUrls()) { + assertTrue(urls.contains(expectedUrl)); + } + } + + } } diff 
--git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelperTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelperTest.java index 34a300876..81e2078aa 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelperTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelperTest.java @@ -36,4 +36,12 @@ public class YoutubeParsingHelperTest { assertEquals(4445767, YoutubeParsingHelper.parseDurationString("1,234:56:07")); assertEquals(754, YoutubeParsingHelper.parseDurationString("12:34 ")); } + + @Test + public void testConvertFromGoogleCacheUrl() throws ParsingException { + assertEquals("https://mohfw.gov.in/", + YoutubeParsingHelper.extractCachedUrlIfNeeded("https://webcache.googleusercontent.com/search?q=cache:https://mohfw.gov.in/")); + assertEquals("https://www.infektionsschutz.de/coronavirus-sars-cov-2.html", + YoutubeParsingHelper.extractCachedUrlIfNeeded("https://www.infektionsschutz.de/coronavirus-sars-cov-2.html")); + } } diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/search/YoutubeSearchExtractorTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/search/YoutubeSearchExtractorTest.java index 6e07b0211..fc8ebbe6f 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/search/YoutubeSearchExtractorTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/search/YoutubeSearchExtractorTest.java @@ -3,15 +3,21 @@ package org.schabi.newpipe.extractor.services.youtube.search; import org.junit.BeforeClass; import org.junit.Test; import org.schabi.newpipe.DownloaderTestImpl; -import org.schabi.newpipe.extractor.InfoItem; -import org.schabi.newpipe.extractor.ListExtractor; -import org.schabi.newpipe.extractor.NewPipe; -import org.schabi.newpipe.extractor.StreamingService; +import 
org.schabi.newpipe.extractor.*; import org.schabi.newpipe.extractor.search.SearchExtractor; import org.schabi.newpipe.extractor.services.DefaultSearchExtractorTest; +import org.schabi.newpipe.extractor.services.youtube.YoutubeService; +import org.schabi.newpipe.extractor.stream.Description; import javax.annotation.Nullable; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + import static java.util.Collections.singletonList; import static junit.framework.TestCase.assertFalse; import static org.junit.Assert.assertEquals; @@ -211,4 +217,39 @@ public class YoutubeSearchExtractorTest { assertNoDuplicatedItems(YouTube, page1, page2); } } + + public static class MetaInfoTest extends DefaultSearchExtractorTest { + private static SearchExtractor extractor; + private static final String QUERY = "Covid"; + + @BeforeClass + public static void setUp() throws Exception { + NewPipe.init(DownloaderTestImpl.getInstance()); + extractor = YouTube.getSearchExtractor(QUERY, singletonList(VIDEOS), ""); + extractor.fetchPage(); + } + + @Override public String expectedSearchString() { return QUERY; } + @Override public String expectedSearchSuggestion() { return null; } + @Override public List expectedMetaInfo() throws MalformedURLException { + final List urls = new ArrayList<>(); + urls.add(new URL("https://www.who.int/emergencies/diseases/novel-coronavirus-2019")); + urls.add(new URL("https://www.who.int/emergencies/diseases/novel-coronavirus-2019/covid-19-vaccines")); + final List urlTexts = new ArrayList<>(); + urlTexts.add("LEARN MORE"); + urlTexts.add("Learn about vaccine progress from the WHO"); + return Collections.singletonList(new MetaInfo( + "COVID-19", + new Description("Get the latest information from the WHO about coronavirus.", Description.PLAIN_TEXT), + urls, + urlTexts + )); + } + @Override public SearchExtractor extractor() { return extractor; } +
@Override public StreamingService expectedService() { return YouTube; } + @Override public String expectedName() { return QUERY; } + @Override public String expectedId() { return QUERY; } + @Override public String expectedUrlContains() { return "youtube.com/results?search_query=" + QUERY; } + @Override public String expectedOriginalUrlContains() throws Exception { return "youtube.com/results?search_query=" + QUERY; } + } } diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java index 8c2022874..07d97181f 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java @@ -3,16 +3,22 @@ package org.schabi.newpipe.extractor.services.youtube.stream; import org.junit.BeforeClass; import org.junit.Test; import org.schabi.newpipe.DownloaderTestImpl; +import org.schabi.newpipe.extractor.MetaInfo; import org.schabi.newpipe.extractor.NewPipe; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.exceptions.ContentNotAvailableException; import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.services.DefaultStreamExtractorTest; +import org.schabi.newpipe.extractor.stream.Description; import org.schabi.newpipe.extractor.stream.StreamExtractor; import org.schabi.newpipe.extractor.stream.StreamSegment; import org.schabi.newpipe.extractor.stream.StreamType; +import javax.annotation.Nullable; +import java.net.MalformedURLException; +import java.net.URL; import java.util.Arrays; +import java.util.Collections; import java.util.List; import javax.annotation.Nullable; @@ -258,4 +264,46 @@ public class YoutubeStreamExtractorDefaultTest 
{ assertNotNull(segment.getPreviewUrl()); } } + + public static class PublicBroadcasterTest extends DefaultStreamExtractorTest { + private static final String ID = "q6fgbYWsMgw"; + private static final int TIMESTAMP = 0; + private static final String URL = BASE_URL + ID; + private static StreamExtractor extractor; + + @BeforeClass + public static void setUp() throws Exception { + NewPipe.init(DownloaderTestImpl.getInstance()); + extractor = YouTube.getStreamExtractor(URL); + extractor.fetchPage(); + } + + @Override public StreamExtractor extractor() { return extractor; } + @Override public StreamingService expectedService() { return YouTube; } + @Override public String expectedName() { return "Was verbirgt sich am tiefsten Punkt des Ozeans?"; } + @Override public String expectedId() { return ID; } + @Override public String expectedUrlContains() { return BASE_URL + ID; } + @Override public String expectedOriginalUrlContains() { return URL; } + + @Override public StreamType expectedStreamType() { return StreamType.VIDEO_STREAM; } + @Override public String expectedUploaderName() { return "Dinge Erklärt – Kurzgesagt"; } + @Override public String expectedUploaderUrl() { return "https://www.youtube.com/channel/UCwRH985XgMYXQ6NxXDo8npw"; } + @Override public List expectedDescriptionContains() { return Arrays.asList("Lasst uns abtauchen!", "Angebot von funk", "Dinge"); } + @Override public long expectedLength() { return 631; } + @Override public long expectedTimestamp() { return TIMESTAMP; } + @Override public long expectedViewCountAtLeast() { return 1_600_000; } + @Nullable @Override public String expectedUploadDate() { return "2019-06-12 00:00:00.000"; } + @Nullable @Override public String expectedTextualUploadDate() { return "2019-06-12"; } + @Override public long expectedLikeCountAtLeast() { return 70000; } + @Override public long expectedDislikeCountAtLeast() { return 500; } + @Override public List expectedMetaInfo() throws MalformedURLException { + return 
Collections.singletonList(new MetaInfo( + "", + new Description("Funk is a German public broadcast service.", Description.PLAIN_TEXT), + Collections.singletonList(new URL("https://de.wikipedia.org/wiki/Funk_(Medienangebot)?wprov=yicw1")), + Collections.singletonList("Wikipedia (German)") + )); + } + } + }