Solve merge conflict

pluja 2020-10-12 08:08:52 +02:00
parent 8efae82302
commit c66afd6485
5 changed files with 142 additions and 45 deletions

View File

@ -454,8 +454,27 @@ def get_live_urls(urls):
def watch():
id = request.args.get('v', None)
info = ytwatch.extract_info(id, False, playlist_id=None, index=None)
# Use nginx
best_formats = ["22", "18", "34", "35", "36", "37", "38", "43", "44", "45", "46"]
vsources = ytwatch.get_video_sources(info, False)
# Retry 3 times if no sources are available.
retry = 3
while retry != 0 and len(vsources) == 0:
vsources = ytwatch.get_video_sources(info, False)
retry -= 1
for source in vsources:
hostName = urllib.parse.urlparse(source['src']).netloc
source['src'] = source['src'].replace("https://{}".format(hostName), "") + "&host=" + hostName
# Parse video formats
for v_format in info['formats']:
hostName = urllib.parse.urlparse(v_format['url']).netloc
v_format['url'] = v_format['url'].replace("https://{}".format(hostName), "") + "&host=" + hostName
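
Both loops above perform the same rewrite: the googlevideo hostname is stripped from the absolute URL and appended as a &host= query parameter, presumably so the player can fetch the stream through the app's own host (the "# Use nginx" comment above points at an nginx pass-through). A minimal sketch of that transformation, with a made-up hostname:

import urllib.parse

src = "https://r4---sn-example.googlevideo.com/videoplayback?expire=123&id=abc"
hostName = urllib.parse.urlparse(src).netloc  # "r4---sn-example.googlevideo.com"
src = src.replace("https://{}".format(hostName), "") + "&host=" + hostName
# src is now "/videoplayback?expire=123&id=abc&host=r4---sn-example.googlevideo.com"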

View File

@ -38,6 +38,14 @@ packaging==20.4
pylint==2.6.0
PyMySQL==0.10.1
pyparsing==2.4.7
PySocks==1.7.1
python-anticaptcha==0.7.1
python-dateutil==2.8.1
python-dotenv==0.14.0
python-editor==1.0.4

View File

@ -11,5 +11,6 @@
"admin_message":"Message from the admin text",
"admin_user":"admin_username",
"max_old_user_days": 60,
"donate_url": ""
"donate_url": "",
"anticaptcha": "cf4bb53a6b87f973be8c0c976c390342"
}

View File

@ -1,9 +1,13 @@
import gzip
import requests
from bs4 import BeautifulSoup
from youtube import yt_data_extract
try:
import brotli
have_brotli = True
except ImportError:
have_brotli = False
@ -15,7 +19,7 @@ import json
import gevent
import gevent.queue
import gevent.lock
from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask
# The trouble with the requests library: It ships its own certificate bundle via certifi
# instead of using the system certificate store, meaning self-signed certificates
# configured by the user will not work. Some draconian networks block TLS unless a corporate
@ -51,13 +55,12 @@ import urllib3.contrib.socks
URL_ORIGIN = "/https://www.youtube.com"
connection_pool = urllib3.PoolManager(cert_reqs='CERT_REQUIRED')
def get_pool(use_tor):
return connection_pool
class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
'''Separate cookiejars for receiving and sending'''
def __init__(self, cookiejar_send=None, cookiejar_receive=None):
self.cookiejar_send = cookiejar_send
self.cookiejar_receive = cookiejar_receive
@ -75,6 +78,7 @@ class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
https_request = http_request
https_response = http_response
class FetchError(Exception):
def __init__(self, code, reason='', ip=None):
Exception.__init__(self, 'HTTP error during request: ' + code + ' ' + reason)
@ -82,6 +86,7 @@ class FetchError(Exception):
self.reason = reason
self.ip = ip
def decode_content(content, encoding_header):
encodings = encoding_header.replace(' ', '').split(',')
for encoding in reversed(encodings):
@ -93,6 +98,57 @@ def decode_content(content, encoding_header):
content = gzip.decompress(content)
return content
def bypass_captcha():
session = requests.Session()
url = "https://youtube.com/watch?v=CvFH_6DNRCY&gl=US&hl=en&has_verified=1&bpctr=9999999999"
print("Starting python GET request...")
response = session.get(url)
print("GET successful!")
print("vvv COOKIES DICT vvv")
cookies = session.cookies.get_dict()
print(cookies)
inputs = {}
html = BeautifulSoup(str(response.text), "lxml")
# If there's a captcha and we need to solve it...
if html.body.find('div', attrs={'class': 'g-recaptcha'}):
# Get the captcha form
form = html.body.find('form', attrs={"action": "/das_captcha"})
# Set up form inputs for request
for _input in form.find_all('input'):
try:
print(_input["name"] + " -> " + _input["value"])
inputs[_input["name"]] = _input["value"]
except KeyError:
continue
print("\n vvv Form inputs created vvv ")
print(inputs)
# Get CAPTCHA keys
site_key = html.body.find('div', attrs={'class': 'g-recaptcha'})['data-sitekey']
s_value = html.body.find('input', attrs={'name': 'session_token'})['value']
# Get anti-captcha API key
config = json.load(open('yotter-config.json'))
client = AnticaptchaClient(config['anticaptcha'])
# Create anti-captcha Task
task = NoCaptchaTaskProxylessTask(url, site_key)
job = client.createTask(task)
job.join()
inputs['g-recaptcha-response'] = job.get_solution_response()
# Print POST request headers
print(requests.post("https://youtube.com/das_captcha", data=inputs,
headers={"Content-Type": "application/x-www-form-urlencoded",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://www.youtube.com/das_captcha",
"Origin": "https://www.youtube.com"}).headers)
def fetch_url_response(url, headers=(), timeout=15, data=None,
cookiejar_send=None, cookiejar_receive=None,
use_tor=True, max_redirects=None):
@ -105,7 +161,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
When both are set to the same object, cookies will be sent from the object,
and response cookies will be merged into it.
'''
headers = dict(headers)  # Note: Calling dict() on a dict will make a copy
if have_brotli:
headers['Accept-Encoding'] = 'gzip, br'
else:
@ -124,32 +180,46 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
elif not isinstance(data, bytes):
data = urllib.parse.urlencode(data).encode('ascii')
if cookiejar_send is not None or cookiejar_receive is not None:  # Use urllib
req = urllib.request.Request(url, data=data, headers=headers)
cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
opener = urllib.request.build_opener(cookie_processor)
response = opener.open(req, timeout=timeout)
cleanup_func = (lambda r: None)
else:  # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
# default: Retry.DEFAULT = Retry(3)
# (in connectionpool.py in urllib3)
# According to the documentation for urlopen, a redirect counts as a
# retry. So there are 3 redirects max by default.
print("Testing for CAPTCHA python GET request...")
r = requests.get(url)
print("GET successful!")
html = BeautifulSoup(str(r.text), "lxml")
# If there's a captcha and we need to solve it...
if html.body.find('div', attrs={'class': 'g-recaptcha'}):
print("ReCaptcha detected! Trying to bypass it.")
bypass_captcha()
if max_redirects:
retries = urllib3.Retry(3 + max_redirects, redirect=max_redirects)
else:
retries = urllib3.Retry(3)
pool = connection_pool
response = pool.request(method, url, headers=headers,
timeout=timeout, preload_content=False,
decode_content=False, retries=retries)
cleanup_func = (lambda r: r.release_conn())
return response, cleanup_func
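
As a side note on the docstring above, a minimal usage sketch (the URL is a placeholder): passing the same CookieJar object as both cookiejar_send and cookiejar_receive makes the request send the stored cookies and merge any response cookies back into that jar, which also selects the urllib code path rather than the urllib3 pool.

import http.cookiejar

jar = http.cookiejar.CookieJar()
response, cleanup_func = fetch_url_response('https://www.youtube.com/', cookiejar_send=jar, cookiejar_receive=jar)
content = response.read()
cleanup_func(response)  # no-op on the urllib path; release_conn() on the urllib3 path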
def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
cookiejar_send=None, cookiejar_receive=None, use_tor=True,
debug_name=None):
@ -159,18 +229,18 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
url, headers, timeout=timeout,
cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive,
use_tor=use_tor)
print(response)
response_time = time.time()
content = response.read()
read_finish = time.time()
cleanup_func(response) # release_connection for urllib3
if (response.status == 429
and content.startswith(b'<!DOCTYPE')
and b'Our systems have detected unusual traffic' in content):
ip = re.search(br'IP address: ((?:[\da-f]*:)+[\da-f]+|(?:\d+\.)+\d+)',
content)
ip = ip.group(1).decode('ascii') if ip else None
raise FetchError('429', reason=response.reason, ip=ip)
@ -178,12 +248,14 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
raise FetchError(str(response.status), reason=response.reason, ip=None)
if report_text:
print(report_text, ' Latency:', round(response_time - start_time, 3), ' Read time:', round(read_finish - response_time, 3))
content = decode_content(content, response.getheader('Content-Encoding', default='identity'))
return content
def head(url, use_tor=False, report_text=None, max_redirects=10):
pool = connection_pool
start_time = time.time()
# default: Retry.DEFAULT = Retry(3)
@ -191,24 +263,21 @@ def head(url, use_tor=False, report_text=None, max_redirects=10):
# According to the documentation for urlopen, a redirect counts as a retry
# So there are 3 redirects max by default. Let's change that
# to 10 since googlevideo redirects a lot.
retries = urllib3.Retry(3 + max_redirects, redirect=max_redirects, raise_on_redirect=False)
headers = {'User-Agent': 'Python-urllib'}
response = pool.request('HEAD', url, headers=headers, retries=retries)
if report_text:
print(report_text, ' Latency:', round(time.time() - start_time, 3))
return response
mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
mobile_ua = (('User-Agent', mobile_user_agent),)
desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
desktop_ua = (('User-Agent', desktop_user_agent),)
class RateLimitedQueue(gevent.queue.Queue):
''' Does initial_burst (def. 30) at first, then alternates between waiting waiting_period (def. 5) seconds and doing subsequent_bursts (def. 10) queries. After 5 seconds with nothing left in the queue, resets rate limiting. '''
@ -225,9 +294,8 @@ class RateLimitedQueue(gevent.queue.Queue):
self.empty_start = 0
gevent.queue.Queue.__init__(self)
def get(self):
self.lock.acquire()  # blocks if another greenlet currently has the lock
if self.count_since_last_wait >= self.subsequent_bursts and self.surpassed_initial:
gevent.sleep(self.waiting_period)
self.count_since_last_wait = 0
@ -243,7 +311,7 @@ class RateLimitedQueue(gevent.queue.Queue):
self.currently_empty = True
self.empty_start = time.monotonic()
item = gevent.queue.Queue.get(self)  # blocks when nothing left
if self.currently_empty:
if time.monotonic() - self.empty_start >= self.waiting_period:
@ -257,7 +325,6 @@ class RateLimitedQueue(gevent.queue.Queue):
return item
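
A rough usage sketch of this queue (the worker function and video ids are placeholders, and the constructor is assumed to accept the defaults named in the docstring): put() behaves like a normal gevent queue, while each get() sleeps as needed to enforce the burst/wait pattern described above.

def thumbnail_worker(queue):
    while True:
        video_id = queue.get()  # blocks and rate-limits as described above
        download_thumbnail('./thumbnails', video_id)

rate_limited = RateLimitedQueue()
gevent.spawn(thumbnail_worker, rate_limited)
for video_id in ['aaaaaaaaaaa', 'bbbbbbbbbbb']:  # placeholder video ids
    rate_limited.put(video_id)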
def download_thumbnail(save_directory, video_id):
url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
save_location = os.path.join(save_directory, video_id + ".jpg")
@ -269,26 +336,23 @@ def download_thumbnail(save_directory, video_id):
try:
f = open(save_location, 'wb')
except FileNotFoundError:
os.makedirs(save_directory, exist_ok=True)
f = open(save_location, 'wb')
f.write(thumbnail)
f.close()
return True
def download_thumbnails(save_directory, ids):
if not isinstance(ids, (list, tuple)):
ids = list(ids)
# only do 5 at a time
# do the n where n is divisible by 5
i = -1
for i in range(0, int(len(ids) / 5) - 1):
gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i * 5, i * 5 + 5)])
# do the remainders (< 5)
gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i * 5 + 5, len(ids))])
def dict_add(*dicts):
@ -296,6 +360,7 @@ def dict_add(*dicts):
dicts[0].update(dictionary)
return dicts[0]
def video_id(url):
url_parts = urllib.parse.urlparse(url)
return urllib.parse.parse_qs(url_parts.query)['v'][0]
@ -304,11 +369,12 @@ def video_id(url):
# default, sddefault, mqdefault, hqdefault, hq720
def get_thumbnail_url(video_id):
return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
def seconds_to_timestamp(seconds):
seconds = int(seconds)
hours, seconds = divmod(seconds, 3600)
minutes, seconds = divmod(seconds, 60)
if hours != 0:
timestamp = str(hours) + ":"
timestamp += str(minutes).zfill(2) # zfill pads with zeros
@ -319,31 +385,32 @@ def seconds_to_timestamp(seconds):
return timestamp
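
A quick worked example of the divmod arithmetic above (the tail of the function, outside this hunk, is assumed to append the zero-padded seconds in the usual way):

# divmod(3725, 3600) -> (1, 125); divmod(125, 60) -> (2, 5)
# so seconds_to_timestamp(3725) produces a '1:02:05'-style timestamp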
def update_query_string(query_string, items):
parameters = urllib.parse.parse_qs(query_string)
parameters.update(items)
return urllib.parse.urlencode(parameters, doseq=True)
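
A small illustration of the round trip above (query string and values are invented):

# update_query_string('v=abc123&t=10', {'t': '20'})
#   parse_qs              -> {'v': ['abc123'], 't': ['10']}
#   parameters.update     -> {'v': ['abc123'], 't': '20'}
#   urlencode(doseq=True) -> 'v=abc123&t=20'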
def uppercase_escape(s):
return re.sub(
r'\\U([0-9a-fA-F]{8})',
lambda m: chr(int(m.group(1), base=16)), s)
def prefix_url(url):
if url is None:
return None
url = url.lstrip('/')  # some urls have // before them, which has a special meaning
return '/' + url
def left_remove(string, substring):
'''removes substring from the start of string, if present'''
if string.startswith(substring):
return string[len(substring):]
return string
def concat_or_none(*strings):
'''Concatenates strings. Returns None if any of the arguments are None'''
result = ''
@ -365,6 +432,7 @@ def prefix_urls(item):
except KeyError:
pass
def add_extra_html_info(item):
if item['type'] == 'video':
item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None
@ -383,6 +451,7 @@ def add_extra_html_info(item):
elif item['type'] == 'channel':
item['url'] = (URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None
def parse_info_prepare_for_html(renderer, additional_info={}):
item = yt_data_extract.extract_item_info(renderer, additional_info)
prefix_urls(item)
@ -390,8 +459,8 @@ def parse_info_prepare_for_html(renderer, additional_info={}):
return item
def check_gevent_exceptions(*tasks):
for task in tasks:
if task.exception:
raise task.exception

View File

@ -148,7 +148,7 @@ headers = (
def extract_info(video_id, use_invidious, playlist_id=None, index=None):
# bpctr=9999999999 will bypass are-you-sure dialogs for controversial
# videos
url = 'https://m.youtube.com/watch?v=' + video_id + '&gl=US&hl=en&has_verified=1&pbj=1&bpctr=9999999999'
if playlist_id:
url += '&list=' + playlist_id
if index: