From c66afd6485ef06e1d4123bd6cf761c957c976190 Mon Sep 17 00:00:00 2001
From: pluja
Date: Mon, 12 Oct 2020 08:08:52 +0200
Subject: [PATCH] Solve merge conflict

---
 app/routes.py      |  19 ++++
 requirements.txt   |   8 +++
 yotter-config.json |   3 +-
 youtube/util.py    | 155 ++++++++++++++++++++++++++++++++-------------
 youtube/watch.py   |   2 +-
 5 files changed, 142 insertions(+), 45 deletions(-)

diff --git a/app/routes.py b/app/routes.py
index 9e26eaf..f7b168f 100644
--- a/app/routes.py
+++ b/app/routes.py
@@ -454,8 +454,27 @@ def get_live_urls(urls):
 def watch():
     id = request.args.get('v', None)
     info = ytwatch.extract_info(id, False, playlist_id=None, index=None)
+<<<<<<< Updated upstream
+<<<<<<< Updated upstream
     # Use nginx
     best_formats = ["22", "18", "34", "35", "36", "37", "38", "43", "44", "45", "46"]
+=======
+=======
+>>>>>>> Stashed changes
+    vsources = ytwatch.get_video_sources(info, False)
+
+    # Retry 3 times if no sources are available.
+    retry = 3
+    while retry != 0 and len(vsources) == 0:
+        vsources = ytwatch.get_video_sources(info, False)
+        retry -= 1
+
+    for source in vsources:
+        hostName = urllib.parse.urlparse(source['src']).netloc
+        source['src'] = source['src'].replace("https://{}".format(hostName), "") + "&host=" + hostName
+
+    # Parse video formats
+>>>>>>> Stashed changes
     for v_format in info['formats']:
         hostName = urllib.parse.urlparse(v_format['url']).netloc
         v_format['url'] = v_format['url'].replace("https://{}".format(hostName), "") + "&host=" + hostName
diff --git a/requirements.txt b/requirements.txt
index d0d7b7e..d4e34b5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -38,6 +38,14 @@ packaging==20.4
 pylint==2.6.0
 PyMySQL==0.10.1
 pyparsing==2.4.7
+<<<<<<< Updated upstream
+=======
+PySocks==1.7.1
+python-anticaptcha==0.7.1
+<<<<<<< Updated upstream
+>>>>>>> Stashed changes
+=======
+>>>>>>> Stashed changes
 python-dateutil==2.8.1
 python-dotenv==0.14.0
 python-editor==1.0.4
diff --git a/yotter-config.json b/yotter-config.json
index e0ceeab..54425f7 100644
--- a/yotter-config.json
+++ b/yotter-config.json
@@ -11,5 +11,6 @@
     "admin_message":"Message from the admin text",
     "admin_user":"admin_username",
     "max_old_user_days": 60,
-    "donate_url": ""
+    "donate_url": "",
+    "anticaptcha": "cf4bb53a6b87f973be8c0c976c390342"
 }
diff --git a/youtube/util.py b/youtube/util.py
index e3f6c65..4588461 100644
--- a/youtube/util.py
+++ b/youtube/util.py
@@ -1,9 +1,13 @@
 import gzip
+import requests
+from bs4 import BeautifulSoup
+
 
 from youtube import yt_data_extract
 
 try:
     import brotli
+
     have_brotli = True
 except ImportError:
     have_brotli = False
@@ -15,7 +19,7 @@ import json
 import gevent
 import gevent.queue
 import gevent.lock
-
+from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask
 # The trouble with the requests library: It ships its own certificate bundle via certifi
 # instead of using the system certificate store, meaning self-signed certificates
 # configured by the user will not work. Some draconian networks block TLS unless a corporate
@@ -51,13 +55,12 @@ import urllib3.contrib.socks
 
 URL_ORIGIN = "/https://www.youtube.com"
 
-connection_pool = urllib3.PoolManager(cert_reqs = 'CERT_REQUIRED')
+connection_pool = urllib3.PoolManager(cert_reqs='CERT_REQUIRED')
 
-def get_pool(use_tor):
-    return connection_pool
 
 class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
     '''Separate cookiejars for receiving and sending'''
+
     def __init__(self, cookiejar_send=None, cookiejar_receive=None):
         self.cookiejar_send = cookiejar_send
         self.cookiejar_receive = cookiejar_receive
@@ -75,6 +78,7 @@ class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
     https_request = http_request
     https_response = http_response
 
+
 class FetchError(Exception):
     def __init__(self, code, reason='', ip=None):
         Exception.__init__(self, 'HTTP error during request: ' + code + ' ' + reason)
@@ -82,6 +86,7 @@ class FetchError(Exception):
         self.reason = reason
         self.ip = ip
 
+
 def decode_content(content, encoding_header):
     encodings = encoding_header.replace(' ', '').split(',')
     for encoding in reversed(encodings):
@@ -93,6 +98,57 @@ def decode_content(content, encoding_header):
             content = gzip.decompress(content)
     return content
 
+
+def bypass_captcha():
+    session = requests.Session()
+    url = "https://youtube.com/watch?v=CvFH_6DNRCY&gl=US&hl=en&has_verified=1&bpctr=9999999999"
+    print("Starting python GET request...")
+    response = session.get(url)
+    print("GET successful!")
+    print("vvv COOKIES DICT vvv")
+    cookies = session.cookies.get_dict()
+    print(cookies)
+
+    inputs = {}
+    html = BeautifulSoup(str(response.text), "lxml")
+
+    # If there's a captcha and we need to solve it...
+    if html.body.find('div', attrs={'class': 'g-recaptcha'}):
+        # Get the captcha form
+        form = html.body.find('form', attrs={"action": "/das_captcha"})
+
+        # Set up form inputs for request
+        for _input in form.find_all('input'):
+            try:
+                print(_input["name"] + " -> " + _input["value"])
+                inputs[_input["name"]] = _input["value"]
+            except KeyError:
+                continue
+        print("\n vvv Form inputs created vvv ")
+        print(inputs)
+
+        # Get CAPTCHA keys
+        site_key = html.body.find('div', attrs={'class': 'g-recaptcha'})['data-sitekey']
+        s_value = html.body.find('input', attrs={'name': 'session_token'})['value']
+
+        # Get anti-captcha API key
+        config = json.load(open('yotter-config.json'))
+        client = AnticaptchaClient(config['anticaptcha'])
+        # Create anti-captcha Task
+        task = NoCaptchaTaskProxylessTask(url, site_key)
+        job = client.createTask(task)
+        job.join()
+
+        inputs['g-recaptcha-response'] = job.get_solution_response()
+
+        # Print POST request headers
+        print(requests.post("https://youtube.com/das_captcha", data=inputs,
+                            headers={"Content-Type": "application/x-www-form-urlencoded",
+                                     "Accept-Language": "en-US,en;q=0.5",
+                                     "Referer": "https://www.youtube.com/das_captcha",
+                                     "Origin": "https://www.youtube.com"}).headers)
+
+
 def fetch_url_response(url, headers=(), timeout=15, data=None,
                        cookiejar_send=None, cookiejar_receive=None,
                        use_tor=True, max_redirects=None):
@@ -105,7 +161,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
     When both are set to the same object, cookies will be sent from the object,
     and response cookies will be merged into it.
     '''
-    headers = dict(headers) # Note: Calling dict() on a dict will make a copy
+    headers = dict(headers)  # Note: Calling dict() on a dict will make a copy
     if have_brotli:
         headers['Accept-Encoding'] = 'gzip, br'
     else:
         headers['Accept-Encoding'] = 'gzip'
@@ -124,32 +180,46 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
     elif not isinstance(data, bytes):
         data = urllib.parse.urlencode(data).encode('ascii')
 
-    if cookiejar_send is not None or cookiejar_receive is not None: # Use urllib
+    if cookiejar_send is not None or cookiejar_receive is not None:  # Use urllib
         req = urllib.request.Request(url, data=data, headers=headers)
 
-        cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
+        cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send,
+                                                         cookiejar_receive=cookiejar_receive)
         opener = urllib.request.build_opener(cookie_processor)
 
         response = opener.open(req, timeout=timeout)
         cleanup_func = (lambda r: None)
 
-    else: # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
+    else:  # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
         # default: Retry.DEFAULT = Retry(3)
         # (in connectionpool.py in urllib3)
         # According to the documentation for urlopen, a redirect counts as a
         # retry. So there are 3 redirects max by default.
+        print("Testing for CAPTCHA python GET request...")
+        r = requests.get(url)
+        print("GET successful!")
+
+        html = BeautifulSoup(str(r.text), "lxml")
+        # If there's a captcha and we need to solve it...
+        if html.body.find('div', attrs={'class': 'g-recaptcha'}):
+            print("ReCaptcha detected! Trying to bypass it.")
+            bypass_captcha()
+
         if max_redirects:
-            retries = urllib3.Retry(3+max_redirects, redirect=max_redirects)
+            retries = urllib3.Retry(3 + max_redirects, redirect=max_redirects)
         else:
             retries = urllib3.Retry(3)
-        pool = get_pool(use_tor)
+
+        pool = connection_pool
         response = pool.request(method, url, headers=headers, timeout=timeout,
                                 preload_content=False, decode_content=False,
                                 retries=retries)
+
         cleanup_func = (lambda r: r.release_conn())
 
     return response, cleanup_func
 
+
 def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
               cookiejar_send=None, cookiejar_receive=None, use_tor=True,
               debug_name=None):
@@ -159,18 +229,18 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
         url, headers, timeout=timeout,
         cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive,
         use_tor=use_tor)
+    print(response)
     response_time = time.time()
 
     content = response.read()
     read_finish = time.time()
-    cleanup_func(response) # release_connection for urllib3
     if (response.status == 429 and content.startswith(b'= self.subsequent_bursts and self.surpassed_initial:
             gevent.sleep(self.waiting_period)
             self.count_since_last_wait = 0
@@ -243,7 +311,7 @@
             self.currently_empty = True
             self.empty_start = time.monotonic()
 
-        item = gevent.queue.Queue.get(self) # blocks when nothing left
+        item = gevent.queue.Queue.get(self)  # blocks when nothing left
 
         if self.currently_empty:
             if time.monotonic() - self.empty_start >= self.waiting_period:
@@ -257,7 +325,6 @@
 
         return item
 
-
 def download_thumbnail(save_directory, video_id):
     url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
     save_location = os.path.join(save_directory, video_id + ".jpg")
@@ -269,26 +336,23 @@ def download_thumbnail(save_directory, video_id):
     try:
         f = open(save_location, 'wb')
     except FileNotFoundError:
-        os.makedirs(save_directory, exist_ok = True)
+        os.makedirs(save_directory, exist_ok=True)
         f = open(save_location, 'wb')
     f.write(thumbnail)
     f.close()
     return True
 
+
 def download_thumbnails(save_directory, ids):
     if not isinstance(ids, (list, tuple)):
         ids = list(ids)
     # only do 5 at a time
     # do the n where n is divisible by 5
     i = -1
-    for i in range(0, int(len(ids)/5) - 1 ):
-        gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5, i*5 + 5)])
+    for i in range(0, int(len(ids) / 5) - 1):
+        gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i * 5, i * 5 + 5)])
     # do the remainders (< 5)
-    gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5 + 5, len(ids))])
-
-
-
-
+    gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i * 5 + 5, len(ids))])
 
 
 def dict_add(*dicts):
@@ -296,6 +360,7 @@
         dicts[0].update(dictionary)
     return dicts[0]
 
+
 def video_id(url):
     url_parts = urllib.parse.urlparse(url)
     return urllib.parse.parse_qs(url_parts.query)['v'][0]
@@ -304,11 +369,12 @@
 # default, sddefault, mqdefault, hqdefault, hq720
 def get_thumbnail_url(video_id):
     return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
-
+
+
 def seconds_to_timestamp(seconds):
     seconds = int(seconds)
-    hours, seconds = divmod(seconds,3600)
-    minutes, seconds = divmod(seconds,60)
+    hours, seconds = divmod(seconds, 3600)
+    minutes, seconds = divmod(seconds, 60)
     if hours != 0:
         timestamp = str(hours) + ":"
         timestamp += str(minutes).zfill(2) # zfill pads with zeros
@@ -319,31 +385,32 @@ def seconds_to_timestamp(seconds):
 
     return timestamp
 
-
 def update_query_string(query_string, items):
     parameters = urllib.parse.parse_qs(query_string)
     parameters.update(items)
     return urllib.parse.urlencode(parameters, doseq=True)
 
-
 def uppercase_escape(s):
-  return re.sub(
-      r'\\U([0-9a-fA-F]{8})',
-      lambda m: chr(int(m.group(1), base=16)), s)
+    return re.sub(
+        r'\\U([0-9a-fA-F]{8})',
+        lambda m: chr(int(m.group(1), base=16)), s)
+
 
 def prefix_url(url):
     if url is None:
         return None
-    url = url.lstrip('/') # some urls have // before them, which has a special meaning
+    url = url.lstrip('/')  # some urls have // before them, which has a special meaning
     return '/' + url
 
+
 def left_remove(string, substring):
     '''removes substring from the start of string, if present'''
     if string.startswith(substring):
         return string[len(substring):]
     return string
 
+
 def concat_or_none(*strings):
     '''Concatenates strings.
     Returns None if any of the arguments are None'''
     result = ''
@@ -365,6 +432,7 @@ def prefix_urls(item):
     except KeyError:
         pass
 
+
 def add_extra_html_info(item):
     if item['type'] == 'video':
         item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None
@@ -383,6 +451,7 @@ def add_extra_html_info(item):
     elif item['type'] == 'channel':
         item['url'] = (URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None
 
+
 def parse_info_prepare_for_html(renderer, additional_info={}):
     item = yt_data_extract.extract_item_info(renderer, additional_info)
     prefix_urls(item)
@@ -390,8 +459,8 @@ def parse_info_prepare_for_html(renderer, additional_info={}):
 
     return item
 
+
 def check_gevent_exceptions(*tasks):
     for task in tasks:
         if task.exception:
             raise task.exception
-
diff --git a/youtube/watch.py b/youtube/watch.py
index 51b220d..e5f54e8 100644
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -148,7 +148,7 @@ headers = (
 def extract_info(video_id, use_invidious, playlist_id=None, index=None):
     # bpctr=9999999999 will bypass are-you-sure dialogs for controversial
     # videos
-    url = 'https://m.youtube.com/watch?v=' + video_id + '&pbj=1&bpctr=9999999999'
+    url = 'https://m.youtube.com/watch?v=' + video_id + '&gl=US&hl=en&has_verified=1&pbj=1&bpctr=9999999999'
     if playlist_id:
         url += '&list=' + playlist_id
     if index:
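
The app/routes.py hunk above rewrites each absolute stream URL into a relative path and moves the original hostname into a host= query parameter, so the local reverse proxy (nginx in Yotter's setup) can forward the request to that upstream. A minimal sketch of the transformation, with an invented hostname and query string used purely for illustration:

    import urllib.parse

    # Invented example URL; only the shape of the rewrite matters here.
    src = "https://r4---sn-example.googlevideo.com/videoplayback?expire=123&id=abc"
    host = urllib.parse.urlparse(src).netloc
    src = src.replace("https://{}".format(host), "") + "&host=" + host
    # src is now "/videoplayback?expire=123&id=abc&host=r4---sn-example.googlevideo.com"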
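
The youtube/util.py changes detect a reCAPTCHA interstitial and hand it to the anti-captcha service through python-anticaptcha: build a NoCaptchaTaskProxylessTask from the challenge page URL and the form's data-sitekey, wait for a worker to solve it, then submit the returned token as the g-recaptcha-response form field. A condensed sketch of that round trip, with the helper name and arguments being illustrative rather than part of the patch:

    from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask


    def solve_recaptcha(api_key, page_url, site_key):
        # Same calls bypass_captcha() makes above: create a proxyless NoCaptcha
        # task, block until a worker returns a solution, and hand back the token
        # that belongs in the g-recaptcha-response field of the form POST.
        client = AnticaptchaClient(api_key)
        task = NoCaptchaTaskProxylessTask(page_url, site_key)
        job = client.createTask(task)
        job.join()
        return job.get_solution_response()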