Rework link handlers to correctly accept external websites

This commit is contained in:
Fynn Godau 2020-12-05 15:08:26 +01:00
parent be562b8436
commit 04dd3d4d32
7 changed files with 63 additions and 22 deletions

View File

@ -123,6 +123,28 @@ public class BandcampExtractorHelper {
return "https://f4.bcbits.com/img/" + (album ? 'a' : "") + id + "_10.jpg";
}
/**
 * Checks whether the given URL belongs to Bandcamp, either because its host is a
 * subdomain of bandcamp.com or because the page it points to declares Bandcamp as
 * its generator (custom domains).
 *
 * <p>URLs that are not on a bandcamp.com subdomain are downloaded in order to
 * inspect their <code>&lt;meta name="generator"&gt;</code> tag.</p>
 *
 * @param url the URL to check
 * @return <code>true</code> if the given url looks like it comes from a bandcamp custom domain
 * or if it comes from bandcamp.com itself
 * @throws ParsingException if the page could not be downloaded to inspect its generator tag
 */
public static boolean isSupportedDomain(final String url) throws ParsingException {
    // Accept all bandcamp.com URLs. The host part must not contain '/', '?' or '#':
    // otherwise something like "https://example.com/x.bandcamp.com" would be accepted
    // here without ever being verified.
    if (url.toLowerCase(Locale.ROOT).matches("https?://[^/?#]+\\.bandcamp\\.com(/.*)?")) {
        return true;
    }

    try {
        // Accept all other URLs if they contain a <meta> tag that says they are generated by bandcamp
        return Jsoup.parse(NewPipe.getDownloader().get(url).responseBody())
                .getElementsByAttributeValue("name", "generator")
                .attr("content").equals("Bandcamp");
    } catch (IOException | ReCaptchaException e) {
        // Keep the original exception as the cause so the underlying failure stays visible
        throw new ParsingException("Could not determine whether URL is custom domain "
                + "(not available? network error?)", e);
    }
}
static DateWrapper parseDate(final String textDate) throws ParsingException {
try {
final Date date = new SimpleDateFormat("dd MMM yyyy HH:mm:ss zzz", Locale.ENGLISH).parse(textDate);

View File

@ -24,7 +24,7 @@ public class BandcampChannelLinkHandlerFactory extends ListLinkHandlerFactory {
try {
final String response = NewPipe.getDownloader().get(url).responseBody();
// This variable contains band data!
// Use band data embedded in website to extract ID
final JsonObject bandData = BandcampExtractorHelper.getJsonData(response, "data-band");
return String.valueOf(bandData.getLong("id"));
@ -51,17 +51,15 @@ public class BandcampChannelLinkHandlerFactory extends ListLinkHandlerFactory {
}
/**
* Matches <code>* .bandcamp.com</code> as well as custom domains
* where the profile is at <code>* . * /releases</code>
* Accepts only pages that do not lead to an album or track. Supports external pages.
*/
@Override
public boolean onAcceptUrl(final String url) {
public boolean onAcceptUrl(final String url) throws ParsingException {
// Is a subdomain of bandcamp.com?
boolean isBandcampComArtistPage = url.matches("https?://.+\\.bandcamp\\.com/?");
// Exclude URLs that lead to a track or album
if (url.matches(".*/(album|track)/.*")) return false;
boolean isCustomDomainReleases = url.matches("https?://.+\\..+/releases/?(?!.)");
return isBandcampComArtistPage || isCustomDomainReleases;
// Test whether domain is supported
return BandcampExtractorHelper.isSupportedDomain(url);
}
}

View File

@ -4,6 +4,7 @@ package org.schabi.newpipe.extractor.services.bandcamp.linkHandler;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory;
import org.schabi.newpipe.extractor.services.bandcamp.extractors.BandcampExtractorHelper;
import java.util.List;
@ -22,8 +23,16 @@ public class BandcampPlaylistLinkHandlerFactory extends ListLinkHandlerFactory {
return url;
}
/**
* Accepts all bandcamp URLs that contain /album/ after their domain name.
*/
@Override
public boolean onAcceptUrl(final String url) {
return url.toLowerCase().matches("https?://.+\\..+/album/.+");
public boolean onAcceptUrl(final String url) throws ParsingException {
// Exclude URLs which do not lead to an album
if (!url.toLowerCase().matches("https?://.+\\..+/album/.+")) return false;
// Test whether domain is supported
return BandcampExtractorHelper.isSupportedDomain(url);
}
}

View File

@ -4,6 +4,7 @@ package org.schabi.newpipe.extractor.services.bandcamp.linkHandler;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.linkhandler.LinkHandlerFactory;
import org.schabi.newpipe.extractor.services.bandcamp.extractors.BandcampExtractorHelper;
/**
* <p>Tracks don't have standalone ids, they are always in combination with the band id.
@ -40,16 +41,19 @@ public class BandcampStreamLinkHandlerFactory extends LinkHandlerFactory {
}
/**
* Sometimes, the root page of an artist is also an album or track
* page. In that case, it is assumed that one actually wants to open
* the profile and not the track it has set as the default one.
* <p>Urls are expected to be in this format to account for
* custom domains:</p>
* <code>https:// * . * /track/ *</code>
* Accepts URLs that point to a bandcamp radio show or that are a bandcamp
* domain and point to a track.
*/
@Override
public boolean onAcceptUrl(final String url) {
return url.toLowerCase().matches("https?://.+\\..+/track/.+")
|| url.toLowerCase().matches("https?://bandcamp\\.com/\\?show=\\d+");
public boolean onAcceptUrl(final String url) throws ParsingException {
// Accept Bandcamp radio
if (url.toLowerCase().matches("https?://bandcamp\\.com/\\?show=\\d+")) return true;
// Don't accept URLs that don't point to a track
if (!url.toLowerCase().matches("https?://.+\\..+/track/.+")) return false;
// Test whether domain is supported
return BandcampExtractorHelper.isSupportedDomain(url);
}
}

View File

@ -26,13 +26,19 @@ public class BandcampChannelLinkHandlerFactoryTest {
@Test
public void testAcceptUrl() throws ParsingException {
assertTrue(linkHandler.acceptUrl("http://interovgm.com/releases/"));
assertTrue(linkHandler.acceptUrl("https://interovgm.com/releases"));
// Bandcamp URLs
assertTrue(linkHandler.acceptUrl("http://zachbenson.bandcamp.com"));
assertTrue(linkHandler.acceptUrl("https://zachbenson.bandcamp.com/"));
assertTrue(linkHandler.acceptUrl("https://billwurtz.bandcamp.com/releases"));
assertFalse(linkHandler.acceptUrl("https://bandcamp.com"));
assertFalse(linkHandler.acceptUrl("https://zachbenson.bandcamp.com/track/kitchen"));
// External URLs
assertTrue(linkHandler.acceptUrl("http://interovgm.com/releases/"));
assertTrue(linkHandler.acceptUrl("https://interovgm.com/releases"));
assertFalse(linkHandler.acceptUrl("https://example.com/releases"));
}
@Test

View File

@ -35,6 +35,7 @@ public class BandcampPlaylistLinkHandlerFactoryTest {
assertFalse(linkHandler.acceptUrl("https://zachbenson.bandcamp.com/"));
assertFalse(linkHandler.acceptUrl("https://zachbenson.bandcamp.com/track/kitchen"));
assertFalse(linkHandler.acceptUrl("https://interovgm.com/track/title"));
assertFalse(linkHandler.acceptUrl("https://example.com/album/samplealbum"));
assertTrue(linkHandler.acceptUrl("https://powertothequeerkids.bandcamp.com/album/power-to-the-queer-kids"));
assertTrue(linkHandler.acceptUrl("https://zachbenson.bandcamp.com/album/prom"));

View File

@ -43,6 +43,7 @@ public class BandcampStreamLinkHandlerFactoryTest {
assertFalse(linkHandler.acceptUrl("https://bandcamp.com"));
assertFalse(linkHandler.acceptUrl("https://zachbenson.bandcamp.com/"));
assertFalse(linkHandler.acceptUrl("https://powertothequeerkids.bandcamp.com/album/power-to-the-queer-kids"));
assertFalse(linkHandler.acceptUrl("https://example.com/track/sampletrack"));
assertTrue(linkHandler.acceptUrl("https://zachbenson.bandcamp.com/track/kitchen"));
assertTrue(linkHandler.acceptUrl("http://ZachBenson.Bandcamp.COM/Track/U-I-Tonite/"));