From 98f49852d79712b4c839c92e7f75c8094315cb6c Mon Sep 17 00:00:00 2001 From: Connectety-W Date: Sun, 13 Jan 2019 12:52:07 +0100 Subject: [PATCH] refactored YouTube-linkHandler to use less regex and more URL-methods --- .../YoutubeChannelLinkHandlerFactory.java | 48 ++++- .../linkHandler/YoutubeParsingHelper.java | 38 ++++ .../YoutubePlaylistLinkHandlerFactory.java | 35 +++- .../YoutubeStreamLinkHandlerFactory.java | 180 +++++++++++------- .../YoutubeTrendingLinkHandlerFactory.java | 13 +- .../schabi/newpipe/extractor/utils/Utils.java | 43 ++++- .../YoutubeStreamLinkHandlerFactoryTest.java | 12 +- 7 files changed, 277 insertions(+), 92 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeChannelLinkHandlerFactory.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeChannelLinkHandlerFactory.java index 950bab2b9..da207b278 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeChannelLinkHandlerFactory.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeChannelLinkHandlerFactory.java @@ -1,9 +1,9 @@ package org.schabi.newpipe.extractor.services.youtube.linkHandler; -import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory; import org.schabi.newpipe.extractor.exceptions.ParsingException; -import org.schabi.newpipe.extractor.utils.Parser; +import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory; +import java.net.URL; import java.util.List; /* @@ -29,25 +29,53 @@ import java.util.List; public class YoutubeChannelLinkHandlerFactory extends ListLinkHandlerFactory { private static final YoutubeChannelLinkHandlerFactory instance = new YoutubeChannelLinkHandlerFactory(); - private static final String ID_PATTERN = "/(user/[A-Za-z0-9_-]*|channel/[A-Za-z0-9_-]*)"; public static YoutubeChannelLinkHandlerFactory getInstance() { return instance; } - @Override - public String getId(String url) throws ParsingException { - return Parser.matchGroup1(ID_PATTERN, url); - } - @Override public String getUrl(String id, List contentFilters, String searchFilter) { return "https://www.youtube.com/" + id; } + @Override + public String getId(String url) throws ParsingException { + try { + URL urlObj = new URL(url); + String path = urlObj.getPath(); + + if (!(YoutubeParsingHelper.isYoutubeURL(urlObj) || urlObj.getHost().equalsIgnoreCase("hooktube.com"))) { + throw new ParsingException("the URL given is not a Youtube-URL"); + } + + if (!path.startsWith("/user/") && !path.startsWith("/channel/")) { + throw new ParsingException("the URL given is neither a channel nor an user"); + } + + // remove leading "/" + path = path.substring(1); + + String[] splitPath = path.split("/"); + String id = splitPath[1]; + + if (id == null || !id.matches("[A-Za-z0-9_-]+")) { + throw new ParsingException("The given id is not a Youtube-Video-ID"); + } + + return splitPath[0] + "/" + id; + } catch (final Exception exception) { + throw new ParsingException("Error could not parse url :" + exception.getMessage(), exception); + } + } + @Override public boolean onAcceptUrl(String url) { - return (url.contains("youtube") || url.contains("youtu.be") || url.contains("hooktube.com")) - && (url.contains("/user/") || url.contains("/channel/")); + try { + getId(url); + } catch (ParsingException e) { + return false; + } + return true; } } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java index 84f1f1351..335bc5bf6 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java @@ -3,6 +3,8 @@ package org.schabi.newpipe.extractor.services.youtube.linkHandler; import org.schabi.newpipe.extractor.exceptions.ParsingException; +import java.net.URL; + /* * Created by Christian Schabesberger on 02.03.16. * @@ -28,6 +30,42 @@ public class YoutubeParsingHelper { private YoutubeParsingHelper() { } + private static boolean isHTTP(URL url) { + // make sure its http or https + String protocol = url.getProtocol(); + if (!protocol.equals("http") && !protocol.equals("https")) { + return false; + } + + boolean usesDefaultPort = url.getPort() == url.getDefaultPort(); + boolean setsNoPort = url.getPort() == -1; + + return setsNoPort || usesDefaultPort; + } + + public static boolean isYoutubeURL(URL url) { + // make sure its http or https + if (!isHTTP(url)) + return false; + + // make sure its a known youtube url + String host = url.getHost(); + return host.equalsIgnoreCase("youtube.com") || host.equalsIgnoreCase("www.youtube.com") + || host.equalsIgnoreCase("m.youtube.com"); + } + + public static boolean isYoutubeALikeURL(URL url) { + // make sure its http or https + if (!isHTTP(url)) + return false; + + // make sure its a known youtube url + String host = url.getHost(); + return host.equalsIgnoreCase("youtube.com") || host.equalsIgnoreCase("www.youtube.com") + || host.equalsIgnoreCase("m.youtube.com") || host.equalsIgnoreCase("www.youtube-nocookie.com") + || host.equalsIgnoreCase("youtu.be") || host.equalsIgnoreCase("hooktube.com"); + } + public static long parseDurationString(String input) throws ParsingException, NumberFormatException { diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubePlaylistLinkHandlerFactory.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubePlaylistLinkHandlerFactory.java index 9954634fc..91944122d 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubePlaylistLinkHandlerFactory.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubePlaylistLinkHandlerFactory.java @@ -1,16 +1,15 @@ package org.schabi.newpipe.extractor.services.youtube.linkHandler; - -import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory; import org.schabi.newpipe.extractor.exceptions.ParsingException; -import org.schabi.newpipe.extractor.utils.Parser; +import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory; +import org.schabi.newpipe.extractor.utils.Utils; +import java.net.URL; import java.util.List; public class YoutubePlaylistLinkHandlerFactory extends ListLinkHandlerFactory { private static final YoutubePlaylistLinkHandlerFactory instance = new YoutubePlaylistLinkHandlerFactory(); - private static final String ID_PATTERN = "([\\-a-zA-Z0-9_]{10,})"; public static YoutubePlaylistLinkHandlerFactory getInstance() { return instance; @@ -24,17 +23,35 @@ public class YoutubePlaylistLinkHandlerFactory extends ListLinkHandlerFactory { @Override public String getId(String url) throws ParsingException { try { - return Parser.matchGroup1("list=" + ID_PATTERN, url); + URL urlObj = new URL(url); + + if (!YoutubeParsingHelper.isYoutubeURL(urlObj)) { + throw new ParsingException("the url given is not a Youtube-URL"); + } + + String listID = Utils.getQueryValue(urlObj, "list"); + + if (listID == null) { + throw new ParsingException("the url given does not include a playlist"); + } + + if (!listID.matches("[a-zA-Z0-9_-]{10,}")) { + throw new ParsingException("the list-ID given in the URL does not match the list pattern"); + } + + return listID; } catch (final Exception exception) { throw new ParsingException("Error could not parse url :" + exception.getMessage(), exception); } } - @Override public boolean onAcceptUrl(final String url) { - final boolean hasNotEmptyUrl = url != null && !url.isEmpty(); - final boolean isYoutubeDomain = hasNotEmptyUrl && (url.contains("youtube") || url.contains("youtu.be")); - return isYoutubeDomain && url.contains("list="); + try { + getId(url); + } catch (ParsingException e) { + return false; + } + return true; } } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeStreamLinkHandlerFactory.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeStreamLinkHandlerFactory.java index 5d07779cf..e479e74c9 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeStreamLinkHandlerFactory.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeStreamLinkHandlerFactory.java @@ -1,21 +1,14 @@ package org.schabi.newpipe.extractor.services.youtube.linkHandler; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.schabi.newpipe.extractor.Downloader; -import org.schabi.newpipe.extractor.NewPipe; -import org.schabi.newpipe.extractor.linkhandler.LinkHandlerFactory; import org.schabi.newpipe.extractor.exceptions.FoundAdException; import org.schabi.newpipe.extractor.exceptions.ParsingException; -import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; -import org.schabi.newpipe.extractor.utils.Parser; +import org.schabi.newpipe.extractor.linkhandler.LinkHandlerFactory; +import org.schabi.newpipe.extractor.utils.Utils; -import java.io.IOException; -import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; -import java.net.URLDecoder; +import java.net.URL; /* * Created by Christian Schabesberger on 02.02.16. @@ -40,7 +33,6 @@ import java.net.URLDecoder; public class YoutubeStreamLinkHandlerFactory extends LinkHandlerFactory { private static final YoutubeStreamLinkHandlerFactory instance = new YoutubeStreamLinkHandlerFactory(); - private static final String ID_PATTERN = "([\\-a-zA-Z0-9_]{11})"; private YoutubeStreamLinkHandlerFactory() { } @@ -49,78 +41,138 @@ public class YoutubeStreamLinkHandlerFactory extends LinkHandlerFactory { return instance; } + private static String assertIsID(String id) throws ParsingException { + if (id == null || !id.matches("[a-zA-Z0-9_-]{11}")) { + throw new ParsingException("The given string is not a Youtube-Video-ID"); + } + + return id; + } + @Override public String getUrl(String id) { return "https://www.youtube.com/watch?v=" + id; } @Override - public String getId(String url) throws ParsingException, IllegalArgumentException { - if (url.isEmpty()) { - throw new IllegalArgumentException("The url parameter should not be empty"); - } + public String getId(String urlString) throws ParsingException, IllegalArgumentException { + try { + URI uri = new URI(urlString); - String lowercaseUrl = url.toLowerCase(); - if (lowercaseUrl.contains("youtube")) { - if (lowercaseUrl.contains("list=")) { - throw new ParsingException("Error no suitable url: " + url); - } - if (url.contains("attribution_link")) { - try { - String escapedQuery = Parser.matchGroup1("u=(.[^&|$]*)", url); - String query = URLDecoder.decode(escapedQuery, "UTF-8"); - return Parser.matchGroup1("v=" + ID_PATTERN, query); - } catch (UnsupportedEncodingException uee) { - throw new ParsingException("Could not parse attribution_link", uee); + if (uri.getScheme().equals("vnd.youtube")) { + String scheme = uri.getSchemeSpecificPart(); + if (scheme.startsWith("//")) { + urlString = "https:" + scheme; + } else { + return assertIsID(scheme); } } - if (url.contains("vnd.youtube")) { - return Parser.matchGroup1(ID_PATTERN, url); - } - if (url.contains("embed")) { - return Parser.matchGroup1("embed/" + ID_PATTERN, url); - } - if (url.contains("googleads")) { - throw new FoundAdException("Error found add: " + url); - } - return Parser.matchGroup1("[?&]v=" + ID_PATTERN, url); + } catch (URISyntaxException ignored) { } - if (lowercaseUrl.contains("youtu.be")) { - if (lowercaseUrl.contains("list=")) { - throw new ParsingException("Error no suitable url: " + url); - } - if (url.contains("v=")) { - return Parser.matchGroup1("v=" + ID_PATTERN, url); - } - return Parser.matchGroup1("[Yy][Oo][Uu][Tt][Uu]\\.[Bb][Ee]/" + ID_PATTERN, url); + + URL url; + try { + url = new URL(urlString); + } catch (MalformedURLException e) { + throw new IllegalArgumentException("The given URL is not valid"); } - if (lowercaseUrl.contains("hooktube")) { - if (lowercaseUrl.contains("&v=") - || lowercaseUrl.contains("?v=")) { - return Parser.matchGroup1("[?&]v=" + ID_PATTERN, url); + + String host = url.getHost(); + String path = url.getPath(); + // remove leading "/" of URL-path if URL-path is given + if (!path.isEmpty()) { + path = path.substring(1); + } + + if (!YoutubeParsingHelper.isYoutubeALikeURL(url)) { + if (host.equalsIgnoreCase("googleads.g.doubleclick.net")) { + throw new FoundAdException("Error found ad: " + urlString); } - if (url.contains("/embed/")) { - return Parser.matchGroup1("embed/" + ID_PATTERN, url); + + throw new ParsingException("The url is not a Youtube-URL"); + } + + if (YoutubePlaylistLinkHandlerFactory.getInstance().acceptUrl(urlString)) { + throw new ParsingException("Error no suitable url: " + urlString); + } + + // using uppercase instead of lowercase, because toLowercase replaces some unicode characters + // with their lowercase ASCII equivalent. Using toLowercase could result in faultily matching unicode urls. + switch (host.toUpperCase()) { + case "WWW.YOUTUBE-NOCOOKIE.COM": { + if (path.startsWith("embed/")) { + String id = path.split("/")[1]; + + return assertIsID(id); + } } - if (url.contains("/v/")) { - return Parser.matchGroup1("v/" + ID_PATTERN, url); + + case "YOUTUBE.COM": + case "WWW.YOUTUBE.COM": + case "M.YOUTUBE.COM": { + if (path.equals("attribution_link")) { + String uQueryValue = Utils.getQueryValue(url, "u"); + + URL decodedURL; + try { + decodedURL = new URL("http://www.youtube.com" + uQueryValue); + } catch (MalformedURLException e) { + throw new ParsingException("Error no suitable url: " + urlString); + } + + String viewQueryValue = Utils.getQueryValue(decodedURL, "v"); + return assertIsID(viewQueryValue); + } + + if (path.startsWith("embed/")) { + String id = path.split("/")[1]; + + return assertIsID(id); + } + + String viewQueryValue = Utils.getQueryValue(url, "v"); + return assertIsID(viewQueryValue); } - if (url.contains("/watch/")) { - return Parser.matchGroup1("watch/" + ID_PATTERN, url); + + case "YOUTU.BE": { + String viewQueryValue = Utils.getQueryValue(url, "v"); + if (viewQueryValue != null) { + return assertIsID(viewQueryValue); + } + + return assertIsID(path); + } + + case "HOOKTUBE.COM": { + if (path.equals("watch")) { + String viewQueryValue = Utils.getQueryValue(url, "v"); + if (viewQueryValue != null) { + return assertIsID(viewQueryValue); + } + } + if (path.startsWith("embed/")) { + String id = path.substring("embed/".length()); + + return assertIsID(id); + } + if (path.startsWith("v/")) { + String id = path.substring("v/".length()); + + return assertIsID(id); + } + if (path.startsWith("watch/")) { + String id = path.substring("watch/".length()); + + return assertIsID(id); + } } } - throw new ParsingException("Error no suitable url: " + url); + + throw new ParsingException("Error no suitable url: " + urlString); } @Override public boolean onAcceptUrl(final String url) throws FoundAdException { - final String lowercaseUrl = url.toLowerCase(); - if (!lowercaseUrl.contains("youtube") && - !lowercaseUrl.contains("youtu.be") && - !lowercaseUrl.contains("hooktube")) { - return false; - // bad programming I know <-- nice meme - } try { getId(url); return true; diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeTrendingLinkHandlerFactory.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeTrendingLinkHandlerFactory.java index e61693b08..123da9d1a 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeTrendingLinkHandlerFactory.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeTrendingLinkHandlerFactory.java @@ -21,8 +21,9 @@ package org.schabi.newpipe.extractor.services.youtube.linkHandler; */ import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory; -import org.schabi.newpipe.extractor.utils.Parser; +import java.net.MalformedURLException; +import java.net.URL; import java.util.List; public class YoutubeTrendingLinkHandlerFactory extends ListLinkHandlerFactory { @@ -38,6 +39,14 @@ public class YoutubeTrendingLinkHandlerFactory extends ListLinkHandlerFactory { @Override public boolean onAcceptUrl(final String url) { - return Parser.isMatch("^(https://|http://|)(www.|m.|)youtube.com/feed/trending(|\\?.*)$", url); + URL urlObj; + try { + urlObj = new URL(url); + } catch (MalformedURLException e) { + return false; + } + + String urlPath = urlObj.getPath(); + return YoutubeParsingHelper.isYoutubeURL(urlObj) && urlPath.equals("/feed/trending"); } } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java index 663fd093b..65f3ce24c 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java @@ -2,6 +2,9 @@ package org.schabi.newpipe.extractor.utils; import org.schabi.newpipe.extractor.exceptions.ParsingException; +import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.net.URLDecoder; import java.util.List; public class Utils { @@ -57,5 +60,43 @@ public class Utils { } return url; } -} + /** + * get the value of a URL-query by name. + * if a url-query is give multiple times, only the value of the first query is returned + * + * @param url the url to be used + * @param parameterName the pattern that will be used to check the url + * @return a string that contains the value of the query parameter or null if nothing was found + */ + public static String getQueryValue(URL url, String parameterName) { + String urlQuery = url.getQuery(); + + if (urlQuery != null) { + for (String param : urlQuery.split("&")) { + String[] params = param.split("=", 2); + + String query; + try { + query = URLDecoder.decode(params[0], "UTF-8"); + } catch (UnsupportedEncodingException e) { + System.err.println("Cannot decode string with UTF-8. using the string without decoding"); + e.printStackTrace(); + query = params[0]; + } + + if (query.equals(parameterName)) { + try { + return URLDecoder.decode(params[1], "UTF-8"); + } catch (UnsupportedEncodingException e) { + System.err.println("Cannot decode string with UTF-8. using the string without decoding"); + e.printStackTrace(); + return params[1]; + } + } + } + } + + return null; + } +} \ No newline at end of file diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeStreamLinkHandlerFactoryTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeStreamLinkHandlerFactoryTest.java index 519eb0efb..f06ad319d 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeStreamLinkHandlerFactoryTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeStreamLinkHandlerFactoryTest.java @@ -60,9 +60,9 @@ public class YoutubeStreamLinkHandlerFactoryTest { public void getIdfromYt() throws Exception { assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://www.youtube.com/watch?v=jZViOEv90dI").getId()); assertEquals("W-fFHeTX70Q", linkHandler.fromUrl("https://www.youtube.com/watch?v=W-fFHeTX70Q").getId()); - assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://www.youtube.com/watch?v=jZViOEv90dI?t=100").getId()); - assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://WWW.YouTube.com/watch?v=jZViOEv90dI?t=100").getId()); - assertEquals("jZViOEv90dI", linkHandler.fromUrl("HTTPS://www.youtube.com/watch?v=jZViOEv90dI?t=100").getId()); + assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://www.youtube.com/watch?v=jZViOEv90dI&t=100").getId()); + assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://WWW.YouTube.com/watch?v=jZViOEv90dI&t=100").getId()); + assertEquals("jZViOEv90dI", linkHandler.fromUrl("HTTPS://www.youtube.com/watch?v=jZViOEv90dI&t=100").getId()); assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://youtu.be/jZViOEv90dI?t=9s").getId()); assertEquals("jZViOEv90dI", linkHandler.fromUrl("HTTPS://Youtu.be/jZViOEv90dI?t=9s").getId()); assertEquals("uEJuoEs1UxY", linkHandler.fromUrl("http://www.youtube.com/watch_popup?v=uEJuoEs1UxY").getId()); @@ -85,9 +85,9 @@ public class YoutubeStreamLinkHandlerFactoryTest { @Test public void testAcceptYtUrl() throws ParsingException { assertTrue(linkHandler.acceptUrl("https://www.youtube.com/watch?v=jZViOEv90dI")); - assertTrue(linkHandler.acceptUrl("https://www.youtube.com/watch?v=jZViOEv90dI?t=100")); - assertTrue(linkHandler.acceptUrl("https://WWW.YouTube.com/watch?v=jZViOEv90dI?t=100")); - assertTrue(linkHandler.acceptUrl("HTTPS://www.youtube.com/watch?v=jZViOEv90dI?t=100")); + assertTrue(linkHandler.acceptUrl("https://www.youtube.com/watch?v=jZViOEv90dI&t=100")); + assertTrue(linkHandler.acceptUrl("https://WWW.YouTube.com/watch?v=jZViOEv90dI&t=100")); + assertTrue(linkHandler.acceptUrl("HTTPS://www.youtube.com/watch?v=jZViOEv90dI&t=100")); assertTrue(linkHandler.acceptUrl("https://youtu.be/jZViOEv90dI?t=9s")); assertTrue(linkHandler.acceptUrl("https://www.youtube.com/embed/jZViOEv90dI")); assertTrue(linkHandler.acceptUrl("https://www.youtube-nocookie.com/embed/jZViOEv90dI"));