From 2e57a8f24f268e091f7d1a4ff5335d2f417956a4 Mon Sep 17 00:00:00 2001 From: Fynn Godau Date: Sun, 14 Mar 2021 09:48:22 +0100 Subject: [PATCH] [Bandcamp] Fix link handler acceptance behaviour * Test for bandcamp footer instead of meta tag (which is not present on all pages) * Accept links to /music, not just /releases * Correctly handle uppercase URLs --- .../extractors/BandcampExtractorHelper.java | 17 +++++++++-------- .../BandcampChannelLinkHandlerFactory.java | 10 +++++++--- .../BandcampChannelLinkHandlerFactoryTest.java | 5 +++++ 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/bandcamp/extractors/BandcampExtractorHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/bandcamp/extractors/BandcampExtractorHelper.java index 547a0356e..970cddc9e 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/bandcamp/extractors/BandcampExtractorHelper.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/bandcamp/extractors/BandcampExtractorHelper.java @@ -7,20 +7,15 @@ import com.grack.nanojson.JsonParser; import com.grack.nanojson.JsonParserException; import com.grack.nanojson.JsonWriter; import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; import org.schabi.newpipe.extractor.NewPipe; import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; import org.schabi.newpipe.extractor.localization.DateWrapper; -import org.schabi.newpipe.extractor.utils.Utils; import java.io.IOException; import java.time.DateTimeException; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; import java.util.Locale; public class BandcampExtractorHelper { @@ -95,12 +90,18 @@ public class BandcampExtractorHelper { if (url.toLowerCase().matches("https?://.+\\.bandcamp\\.com(/.*)?")) return true; try { - // Accept all other URLs if they contain a tag that says they are generated by bandcamp + // Test other URLs for whether they contain a footer that links to bandcamp return Jsoup.parse( NewPipe.getDownloader().get(url).responseBody() ) - .getElementsByAttributeValue("name", "generator") - .attr("content").equals("Bandcamp"); + .getElementById("pgFt") + .getElementById("pgFt-inner") + .getElementById("footer-logo-wrapper") + .getElementById("footer-logo") + .getElementsByClass("hiddenAccess") + .text().equals("Bandcamp"); + } catch (NullPointerException e) { + return false; } catch (IOException | ReCaptchaException e) { throw new ParsingException("Could not determine whether URL is custom domain " + "(not available? network error?)"); diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/bandcamp/linkHandler/BandcampChannelLinkHandlerFactory.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/bandcamp/linkHandler/BandcampChannelLinkHandlerFactory.java index 727aec404..05caf1141 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/bandcamp/linkHandler/BandcampChannelLinkHandlerFactory.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/bandcamp/linkHandler/BandcampChannelLinkHandlerFactory.java @@ -55,7 +55,9 @@ public class BandcampChannelLinkHandlerFactory extends ListLinkHandlerFactory { * Accepts only pages that lead to the root of an artist profile. Supports external pages. */ @Override - public boolean onAcceptUrl(final String url) throws ParsingException { + public boolean onAcceptUrl(String url) throws ParsingException { + + url = url.toLowerCase(); // https: | | artist.bandcamp.com | releases // 0 1 2 3 @@ -64,8 +66,10 @@ public class BandcampChannelLinkHandlerFactory extends ListLinkHandlerFactory { // URL is too short if (splitUrl.length < 3) return false; - // Must have "releases" as segment after url or none at all - if (splitUrl.length > 3 && !splitUrl[3].equals("releases")) { + // Must have "releases" or "music" as segment after url or none at all + if (splitUrl.length > 3 && !( + splitUrl[3].equals("releases") || splitUrl[3].equals("music") + )) { return false; diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/bandcamp/BandcampChannelLinkHandlerFactoryTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/bandcamp/BandcampChannelLinkHandlerFactoryTest.java index 14c54c5aa..85c182734 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/bandcamp/BandcampChannelLinkHandlerFactoryTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/bandcamp/BandcampChannelLinkHandlerFactoryTest.java @@ -37,12 +37,16 @@ public class BandcampChannelLinkHandlerFactoryTest { assertFalse(linkHandler.acceptUrl("https://bandcamp.com")); assertFalse(linkHandler.acceptUrl("https://zachbenson.bandcamp.com/track/kitchen")); assertFalse(linkHandler.acceptUrl("https://daily.bandcamp.com/")); + assertFalse(linkHandler.acceptUrl("https://DAILY.BANDCAMP.COM")); assertFalse(linkHandler.acceptUrl("https://daily.bandcamp.com/best-of-2020/bandcamp-daily-staffers-on-their-favorite-albums-of-2020")); // External URLs assertTrue(linkHandler.acceptUrl("https://lobstertheremin.com")); assertTrue(linkHandler.acceptUrl("https://lobstertheremin.com/music")); assertTrue(linkHandler.acceptUrl("https://lobstertheremin.com/music/")); + assertTrue(linkHandler.acceptUrl("https://diskak.usopop.com/")); + assertTrue(linkHandler.acceptUrl("https://diskak.usopop.com/releases")); + assertTrue(linkHandler.acceptUrl("https://diskak.usopop.com/RELEASES")); assertFalse(linkHandler.acceptUrl("https://example.com/releases")); } @@ -57,6 +61,7 @@ public class BandcampChannelLinkHandlerFactoryTest { assertEquals("2735462545", linkHandler.getId("http://lobstertheremin.com/")); assertEquals("2735462545", linkHandler.getId("https://lobstertheremin.com/music/")); + assertEquals("3826445168", linkHandler.getId("https://diskak.usopop.com/releases")); } @Test