From 26234a1c0fa7cf0a03a47a38948f0ee4f0df29a7 Mon Sep 17 00:00:00 2001 From: Mauricio Colli Date: Mon, 16 Dec 2019 04:35:44 -0300 Subject: [PATCH] Introduce FeedExtractor making fetching from dedicated feeds possible YouTube, for example, has a dedicated feed which was built to be used like this. --- .../newpipe/extractor/StreamingService.java | 22 ++++- .../newpipe/extractor/feed/FeedExtractor.java | 17 ++++ .../newpipe/extractor/feed/FeedInfo.java | 52 ++++++++++ .../services/youtube/YoutubeService.java | 20 ++-- .../extractors/YoutubeChannelExtractor.java | 3 +- .../extractors/YoutubeFeedExtractor.java | 82 ++++++++++++++++ .../YoutubeFeedInfoItemExtractor.java | 94 +++++++++++++++++++ .../linkHandler/YoutubeParsingHelper.java | 13 +++ .../youtube/YoutubeFeedExtractorTest.java | 72 ++++++++++++++ 9 files changed, 361 insertions(+), 14 deletions(-) create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/feed/FeedExtractor.java create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/feed/FeedInfo.java create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeFeedExtractor.java create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeFeedInfoItemExtractor.java create mode 100644 extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeFeedExtractorTest.java diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/StreamingService.java b/extractor/src/main/java/org/schabi/newpipe/extractor/StreamingService.java index ad22642db..683985062 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/StreamingService.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/StreamingService.java @@ -7,6 +7,7 @@ import org.schabi.newpipe.extractor.channel.ChannelExtractor; import org.schabi.newpipe.extractor.comments.CommentsExtractor; import org.schabi.newpipe.extractor.exceptions.ExtractionException; 
import org.schabi.newpipe.extractor.exceptions.ParsingException; +import org.schabi.newpipe.extractor.feed.FeedExtractor; import org.schabi.newpipe.extractor.kiosk.KioskList; import org.schabi.newpipe.extractor.linkhandler.LinkHandler; import org.schabi.newpipe.extractor.linkhandler.LinkHandlerFactory; @@ -24,6 +25,8 @@ import org.schabi.newpipe.extractor.stream.StreamExtractor; import org.schabi.newpipe.extractor.subscription.SubscriptionExtractor; import org.schabi.newpipe.extractor.suggestion.SuggestionExtractor; +import javax.annotation.Nullable; + /* * Copyright (C) Christian Schabesberger 2018 * StreamingService.java is part of NewPipe. @@ -65,7 +68,7 @@ public abstract class StreamingService { public String getName() { return name; } - + public List getMediaCapabilities() { return mediaCapabilities; } @@ -116,7 +119,7 @@ public abstract class StreamingService { public String toString() { return serviceId + ":" + serviceInfo.getName(); } - + public abstract String getBaseUrl(); /*////////////////////////////////////////////////////////////////////////// @@ -173,6 +176,19 @@ public abstract class StreamingService { */ public abstract SubscriptionExtractor getSubscriptionExtractor(); + /** + * This method decides which strategy will be chosen to fetch the feed. In YouTube, for example, a separate feed + * exists which is lightweight and made specifically to be used like this. + *

+ * Services that provide no dedicated way to retrieve the feed should return null. + * + * @return a {@link FeedExtractor} instance or null. + */ + @Nullable + public FeedExtractor getFeedExtractor(String url) throws ExtractionException { + return null; + } + /** * Must create a new instance of a KioskList implementation. * @return a new KioskList instance @@ -258,7 +274,7 @@ public abstract class StreamingService { } return getCommentsExtractor(llhf.fromUrl(url)); } - + /*////////////////////////////////////////////////////////////////////////// // Utils //////////////////////////////////////////////////////////////////////////*/ diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/feed/FeedExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/feed/FeedExtractor.java new file mode 100644 index 000000000..df3e8915b --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/feed/FeedExtractor.java @@ -0,0 +1,17 @@ +package org.schabi.newpipe.extractor.feed; + +import org.schabi.newpipe.extractor.ListExtractor; +import org.schabi.newpipe.extractor.StreamingService; +import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; +import org.schabi.newpipe.extractor.stream.StreamInfoItem; + +/** + * This class helps to extract items from lightweight feeds that the services may provide. + *

+ * YouTube is an example of a service that has this alternative available. + */ +public abstract class FeedExtractor extends ListExtractor { + public FeedExtractor(StreamingService service, ListLinkHandler listLinkHandler) { + super(service, listLinkHandler); + } +} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/feed/FeedInfo.java b/extractor/src/main/java/org/schabi/newpipe/extractor/feed/FeedInfo.java new file mode 100644 index 000000000..f361cec7e --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/feed/FeedInfo.java @@ -0,0 +1,52 @@ +package org.schabi.newpipe.extractor.feed; + +import org.schabi.newpipe.extractor.ListExtractor.InfoItemsPage; +import org.schabi.newpipe.extractor.ListInfo; +import org.schabi.newpipe.extractor.NewPipe; +import org.schabi.newpipe.extractor.StreamingService; +import org.schabi.newpipe.extractor.exceptions.ExtractionException; +import org.schabi.newpipe.extractor.stream.StreamInfoItem; +import org.schabi.newpipe.extractor.utils.ExtractorHelper; + +import java.io.IOException; +import java.util.List; + +public class FeedInfo extends ListInfo { + + public FeedInfo(int serviceId, String id, String url, String originalUrl, String name, List contentFilter, String sortFilter) { + super(serviceId, id, url, originalUrl, name, contentFilter, sortFilter); + } + + public static FeedInfo getInfo(String url) throws IOException, ExtractionException { + return getInfo(NewPipe.getServiceByUrl(url), url); + } + + public static FeedInfo getInfo(StreamingService service, String url) throws IOException, ExtractionException { + final FeedExtractor extractor = service.getFeedExtractor(url); + + if (extractor == null) { + throw new IllegalArgumentException("Service \"" + service.getServiceInfo().getName() + "\" doesn't support FeedExtractor."); + } + + extractor.fetchPage(); + return getInfo(extractor); + } + + public static FeedInfo getInfo(FeedExtractor extractor) throws IOException, ExtractionException { + 
extractor.fetchPage(); + + final int serviceId = extractor.getServiceId(); + final String id = extractor.getId(); + final String url = extractor.getUrl(); + final String originalUrl = extractor.getOriginalUrl(); + final String name = extractor.getName(); + + final FeedInfo info = new FeedInfo(serviceId, id, url, originalUrl, name, null, null); + + final InfoItemsPage itemsPage = ExtractorHelper.getItemsPageOrLogError(info, extractor); + info.setRelatedItems(itemsPage.getItems()); + info.setNextPageUrl(itemsPage.getNextPageUrl()); + + return info; + } +} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeService.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeService.java index 78c97cfbe..6137f0293 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeService.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeService.java @@ -12,6 +12,7 @@ import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.channel.ChannelExtractor; import org.schabi.newpipe.extractor.comments.CommentsExtractor; import org.schabi.newpipe.extractor.exceptions.ExtractionException; +import org.schabi.newpipe.extractor.feed.FeedExtractor; import org.schabi.newpipe.extractor.kiosk.KioskExtractor; import org.schabi.newpipe.extractor.kiosk.KioskList; import org.schabi.newpipe.extractor.linkhandler.LinkHandler; @@ -24,14 +25,7 @@ import org.schabi.newpipe.extractor.localization.ContentCountry; import org.schabi.newpipe.extractor.localization.Localization; import org.schabi.newpipe.extractor.playlist.PlaylistExtractor; import org.schabi.newpipe.extractor.search.SearchExtractor; -import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeChannelExtractor; -import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeCommentsExtractor; -import 
org.schabi.newpipe.extractor.services.youtube.extractors.YoutubePlaylistExtractor; -import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeSearchExtractor; -import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeStreamExtractor; -import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeSubscriptionExtractor; -import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeSuggestionExtractor; -import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeTrendingExtractor; +import org.schabi.newpipe.extractor.services.youtube.extractors.*; import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeChannelLinkHandlerFactory; import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeCommentsLinkHandlerFactory; import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubePlaylistLinkHandlerFactory; @@ -42,6 +36,8 @@ import org.schabi.newpipe.extractor.stream.StreamExtractor; import org.schabi.newpipe.extractor.subscription.SubscriptionExtractor; import org.schabi.newpipe.extractor.suggestion.SuggestionExtractor; +import javax.annotation.Nonnull; + /* * Created by Christian Schabesberger on 23.08.15. 
* @@ -72,7 +68,7 @@ public class YoutubeService extends StreamingService { public String getBaseUrl() { return "https://youtube.com"; } - + @Override public LinkHandlerFactory getStreamLHFactory() { return YoutubeStreamLinkHandlerFactory.getInstance(); @@ -147,6 +143,12 @@ public class YoutubeService extends StreamingService { return new YoutubeSubscriptionExtractor(this); } + @Nonnull + @Override + public FeedExtractor getFeedExtractor(final String channelUrl) throws ExtractionException { + return new YoutubeFeedExtractor(this, getChannelLHFactory().fromUrl(channelUrl)); + } + @Override public ListLinkHandlerFactory getCommentsLHFactory() { return YoutubeCommentsLinkHandlerFactory.getInstance(); diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java index adc4705e4..d675cb255 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java @@ -46,7 +46,6 @@ import java.io.IOException; @SuppressWarnings("WeakerAccess") public class YoutubeChannelExtractor extends ChannelExtractor { /*package-private*/ static final String CHANNEL_URL_BASE = "https://www.youtube.com/channel/"; - private static final String CHANNEL_FEED_BASE = "https://www.youtube.com/feeds/videos.xml?channel_id="; private static final String CHANNEL_URL_PARAMETERS = "/videos?view=0&flow=list&sort=dd&live_view=10000"; private Document doc; @@ -130,7 +129,7 @@ public class YoutubeChannelExtractor extends ChannelExtractor { @Override public String getFeedUrl() throws ParsingException { try { - return CHANNEL_FEED_BASE + getId(); + return YoutubeParsingHelper.getFeedUrlFrom(getId()); } catch (Exception e) { throw new ParsingException("Could not get feed 
url", e); } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeFeedExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeFeedExtractor.java new file mode 100644 index 000000000..6ed91fe4f --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeFeedExtractor.java @@ -0,0 +1,82 @@ +package org.schabi.newpipe.extractor.services.youtube.extractors; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.schabi.newpipe.extractor.ListExtractor; +import org.schabi.newpipe.extractor.StreamingService; +import org.schabi.newpipe.extractor.downloader.Downloader; +import org.schabi.newpipe.extractor.downloader.Response; +import org.schabi.newpipe.extractor.exceptions.ExtractionException; +import org.schabi.newpipe.extractor.feed.FeedExtractor; +import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; +import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper; +import org.schabi.newpipe.extractor.stream.StreamInfoItem; +import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector; + +import javax.annotation.Nonnull; +import java.io.IOException; + +public class YoutubeFeedExtractor extends FeedExtractor { + public YoutubeFeedExtractor(StreamingService service, ListLinkHandler linkHandler) { + super(service, linkHandler); + } + + private Document document; + + @Override + public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException { + final String channelIdOrUser = getLinkHandler().getId(); + final String feedUrl = YoutubeParsingHelper.getFeedUrlFrom(channelIdOrUser); + + final Response response = downloader.get(feedUrl); + document = Jsoup.parse(response.responseBody()); + } + + @Nonnull + @Override + public ListExtractor.InfoItemsPage getInitialPage() { + final 
Elements entries = document.select("feed > entry"); + final StreamInfoItemsCollector collector = new StreamInfoItemsCollector(getServiceId()); + + for (Element entryElement : entries) { + collector.commit(new YoutubeFeedInfoItemExtractor(entryElement)); + } + + return new InfoItemsPage<>(collector, null); + } + + @Nonnull + @Override + public String getId() { + return document.getElementsByTag("yt:channelId").first().text(); + } + + @Nonnull + @Override + public String getUrl() { + return document.select("feed > author > uri").first().text(); + } + + @Nonnull + @Override + public String getName() { + return document.select("feed > author > name").first().text(); + } + + @Override + public String getNextPageUrl() { + return null; + } + + @Override + public InfoItemsPage getPage(String pageUrl) { + return null; + } + + @Override + public boolean hasNextPage() { + return false; + } +} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeFeedInfoItemExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeFeedInfoItemExtractor.java new file mode 100644 index 000000000..aadc80223 --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeFeedInfoItemExtractor.java @@ -0,0 +1,94 @@ +package org.schabi.newpipe.extractor.services.youtube.extractors; + +import org.jsoup.nodes.Element; +import org.schabi.newpipe.extractor.exceptions.ParsingException; +import org.schabi.newpipe.extractor.localization.DateWrapper; +import org.schabi.newpipe.extractor.stream.StreamInfoItemExtractor; +import org.schabi.newpipe.extractor.stream.StreamType; + +import javax.annotation.Nullable; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; +import java.util.TimeZone; + +public class YoutubeFeedInfoItemExtractor implements StreamInfoItemExtractor { + private final Element 
entryElement; + + public YoutubeFeedInfoItemExtractor(Element entryElement) { + this.entryElement = entryElement; + } + + @Override + public StreamType getStreamType() { + // It is not possible to determine the stream type using the feed endpoint. + // All entries are considered a video stream. + return StreamType.VIDEO_STREAM; + } + + @Override + public boolean isAd() { + return false; + } + + @Override + public long getDuration() { + // Not available when fetching through the feed endpoint. + return -1; + } + + @Override + public long getViewCount() { + return Long.parseLong(entryElement.getElementsByTag("media:statistics").first().attr("views")); + } + + @Override + public String getUploaderName() { + return entryElement.select("author > name").first().text(); + } + + @Override + public String getUploaderUrl() { + return entryElement.select("author > uri").first().text(); + } + + @Nullable + @Override + public String getTextualUploadDate() { + return entryElement.getElementsByTag("published").first().text(); + } + + @Nullable + @Override + public DateWrapper getUploadDate() throws ParsingException { + final Date date; + try { + final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss+00:00"); + dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + date = dateFormat.parse(getTextualUploadDate()); + } catch (ParseException e) { + throw new ParsingException("Could not parse date (\"" + getTextualUploadDate() + "\")", e); + } + + final Calendar calendar = Calendar.getInstance(); + calendar.setTime(date); + + return new DateWrapper(calendar); + } + + @Override + public String getName() { + return entryElement.getElementsByTag("title").first().text(); + } + + @Override + public String getUrl() { + return entryElement.getElementsByTag("link").first().attr("href"); + } + + @Override + public String getThumbnailUrl() { + return entryElement.getElementsByTag("media:thumbnail").first().attr("url"); + } +} diff --git 
a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java index 0e76ddf67..65ec7e3f6 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java @@ -38,6 +38,9 @@ public class YoutubeParsingHelper { private YoutubeParsingHelper() { } + private static final String FEED_BASE_CHANNEL_ID = "https://www.youtube.com/feeds/videos.xml?channel_id="; + private static final String FEED_BASE_USER = "https://www.youtube.com/feeds/videos.xml?user="; + private static final String[] RECAPTCHA_DETECTION_SELECTORS = { "form[action*=\"/das_captcha\"]", "input[name*=\"action_recaptcha_verify\"]" @@ -118,6 +121,16 @@ public class YoutubeParsingHelper { + Long.parseLong(seconds); } + public static String getFeedUrlFrom(final String channelIdOrUser) { + if (channelIdOrUser.startsWith("user/")) { + return FEED_BASE_USER + channelIdOrUser.substring("user/".length()); // strip only the leading "user/" prefix + } else if (channelIdOrUser.startsWith("channel/")) { + return FEED_BASE_CHANNEL_ID + channelIdOrUser.substring("channel/".length()); // strip only the leading "channel/" prefix + } else { + return FEED_BASE_CHANNEL_ID + channelIdOrUser; + } + } + public static Calendar parseDateFrom(String textualUploadDate) throws ParsingException { Date date; try { diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeFeedExtractorTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeFeedExtractorTest.java new file mode 100644 index 000000000..c7f1b1c1d --- /dev/null +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeFeedExtractorTest.java @@ -0,0 +1,72 @@ +package org.schabi.newpipe.extractor.services.youtube; + +import org.junit.BeforeClass; +import
org.junit.Test; +import org.schabi.newpipe.DownloaderTestImpl; +import org.schabi.newpipe.extractor.NewPipe; +import org.schabi.newpipe.extractor.exceptions.ParsingException; +import org.schabi.newpipe.extractor.services.BaseListExtractorTest; +import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeFeedExtractor; + +import static org.junit.Assert.*; +import static org.schabi.newpipe.extractor.ServiceList.YouTube; +import static org.schabi.newpipe.extractor.services.DefaultTests.defaultTestRelatedItems; + +public class YoutubeFeedExtractorTest { + public static class Kurzgesagt implements BaseListExtractorTest { + private static YoutubeFeedExtractor extractor; + + @BeforeClass + public static void setUp() throws Exception { + NewPipe.init(DownloaderTestImpl.getInstance()); + extractor = (YoutubeFeedExtractor) YouTube + .getFeedExtractor("https://www.youtube.com/user/Kurzgesagt"); + extractor.fetchPage(); + } + + /*////////////////////////////////////////////////////////////////////////// + // Extractor + //////////////////////////////////////////////////////////////////////////*/ + + @Test + public void testServiceId() { + assertEquals(YouTube.getServiceId(), extractor.getServiceId()); + } + + @Test + public void testName() { + String name = extractor.getName(); + assertTrue(name, name.startsWith("Kurzgesagt")); + } + + @Test + public void testId() { + assertEquals("UCsXVk37bltHxD1rDPwtNM8Q", extractor.getId()); + } + + @Test + public void testUrl() { + assertEquals("https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q", extractor.getUrl()); + } + + @Test + public void testOriginalUrl() throws ParsingException { + assertEquals("https://www.youtube.com/user/Kurzgesagt", extractor.getOriginalUrl()); + } + + /*////////////////////////////////////////////////////////////////////////// + // ListExtractor + //////////////////////////////////////////////////////////////////////////*/ + + @Test + public void testRelatedItems() throws Exception { + 
defaultTestRelatedItems(extractor, YouTube.getServiceId()); + } + + @Test + public void testMoreRelatedItems() { + assertFalse(extractor.hasNextPage()); + assertNull(extractor.getNextPageUrl()); + } + } +} \ No newline at end of file