Solve merge conflict

pluja 2020-10-12 08:08:52 +02:00
parent 8efae82302
commit c66afd6485
5 changed files with 142 additions and 45 deletions

View File

@ -454,8 +454,27 @@ def get_live_urls(urls):
def watch():
id = request.args.get('v', None)
info = ytwatch.extract_info(id, False, playlist_id=None, index=None)
# Use nginx
best_formats = ["22", "18", "34", "35", "36", "37", "38", "43", "44", "45", "46"]
vsources = ytwatch.get_video_sources(info, False)
# Retry 3 times if no sources are available.
retry = 3
while retry != 0 and len(vsources) == 0:
vsources = ytwatch.get_video_sources(info, False)
retry -= 1
for source in vsources:
hostName = urllib.parse.urlparse(source['src']).netloc
source['src'] = source['src'].replace("https://{}".format(hostName), "") + "&host=" + hostName
# Parse video formats
for v_format in info['formats']:
hostName = urllib.parse.urlparse(v_format['url']).netloc
v_format['url'] = v_format['url'].replace("https://{}".format(hostName), "") + "&host=" + hostName
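
Both loops above perform the same rewrite: the googlevideo hostname is stripped from the absolute URL and appended as a &host= query parameter, presumably so the player can fetch the stream through the app's own host (the "# Use nginx" comment above points at an nginx pass-through). A minimal sketch of that transformation, with a made-up hostname:

import urllib.parse

src = "https://r4---sn-example.googlevideo.com/videoplayback?expire=123&id=abc"
hostName = urllib.parse.urlparse(src).netloc  # "r4---sn-example.googlevideo.com"
src = src.replace("https://{}".format(hostName), "") + "&host=" + hostName
# src is now "/videoplayback?expire=123&id=abc&host=r4---sn-example.googlevideo.com"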

View File

@ -38,6 +38,14 @@ packaging==20.4
pylint==2.6.0
PyMySQL==0.10.1
pyparsing==2.4.7
PySocks==1.7.1
python-anticaptcha==0.7.1
python-dateutil==2.8.1
python-dotenv==0.14.0
python-editor==1.0.4

View File

@ -11,5 +11,6 @@
"admin_message":"Message from the admin text",
"admin_user":"admin_username",
"max_old_user_days": 60,
"donate_url": ""
"donate_url": "",
"anticaptcha": "cf4bb53a6b87f973be8c0c976c390342"
}

View File

@ -1,9 +1,13 @@
import gzip
import requests
from bs4 import BeautifulSoup
from youtube import yt_data_extract
try:
import brotli
have_brotli = True
except ImportError:
have_brotli = False
@ -15,7 +19,7 @@ import json
import gevent
import gevent.queue
import gevent.lock
from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask
# The trouble with the requests library: It ships its own certificate bundle via certifi
# instead of using the system certificate store, meaning self-signed certificates
# configured by the user will not work. Some draconian networks block TLS unless a corporate
@ -51,13 +55,12 @@ import urllib3.contrib.socks
URL_ORIGIN = "/https://www.youtube.com"
connection_pool = urllib3.PoolManager(cert_reqs='CERT_REQUIRED')
def get_pool(use_tor):
return connection_pool
class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
'''Separate cookiejars for receiving and sending'''
def __init__(self, cookiejar_send=None, cookiejar_receive=None):
self.cookiejar_send = cookiejar_send
self.cookiejar_receive = cookiejar_receive
@ -75,6 +78,7 @@ class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
https_request = http_request
https_response = http_response
class FetchError(Exception):
def __init__(self, code, reason='', ip=None):
Exception.__init__(self, 'HTTP error during request: ' + code + ' ' + reason)
@ -82,6 +86,7 @@ class FetchError(Exception):
self.reason = reason
self.ip = ip
def decode_content(content, encoding_header):
encodings = encoding_header.replace(' ', '').split(',')
for encoding in reversed(encodings):
@ -93,6 +98,57 @@ def decode_content(content, encoding_header):
content = gzip.decompress(content)
return content
def bypass_captcha():
session = requests.Session()
url = "https://youtube.com/watch?v=CvFH_6DNRCY&gl=US&hl=en&has_verified=1&bpctr=9999999999"
print("Starting python GET request...")
response = session.get(url)
print("GET successful!")
print("vvv COOKIES DICT vvv")
cookies = session.cookies.get_dict()
print(cookies)
inputs = {}
html = BeautifulSoup(str(response.text), "lxml")
# If there's a captcha and we need to solve it...
if html.body.find('div', attrs={'class': 'g-recaptcha'}):
# Get the captcha form
form = html.body.find('form', attrs={"action": "/das_captcha"})
# Set up form inputs for request
for _input in form.find_all('input'):
try:
print(_input["name"] + " -> " + _input["value"])
inputs[_input["name"]] = _input["value"]
except KeyError:
continue
print("\n vvv Form inputs created vvv ")
print(inputs)
# Get CAPTCHA keys
site_key = html.body.find('div', attrs={'class': 'g-recaptcha'})['data-sitekey']
s_value = html.body.find('input', attrs={'name': 'session_token'})['value']
# Get anti-captcha API key
config = json.load(open('yotter-config.json'))
client = AnticaptchaClient(config['anticaptcha'])
# Create anti-captcha Task
task = NoCaptchaTaskProxylessTask(url, site_key)
job = client.createTask(task)
job.join()
inputs['g-recaptcha-response'] = job.get_solution_response()
# Print POST request headers
print(requests.post("https://youtube.com/das_captcha", data=inputs,
headers={"Content-Type": "application/x-www-form-urlencoded",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://www.youtube.com/das_captcha",
"Origin": "https://www.youtube.com"}).headers)
def fetch_url_response(url, headers=(), timeout=15, data=None,
cookiejar_send=None, cookiejar_receive=None,
use_tor=True, max_redirects=None):
@ -105,7 +161,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
When both are set to the same object, cookies will be sent from the object,
and response cookies will be merged into it.
'''
headers = dict(headers)  # Note: Calling dict() on a dict will make a copy
if have_brotli:
headers['Accept-Encoding'] = 'gzip, br'
else:
@ -124,32 +180,46 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
elif not isinstance(data, bytes):
data = urllib.parse.urlencode(data).encode('ascii')
if cookiejar_send is not None or cookiejar_receive is not None:  # Use urllib
req = urllib.request.Request(url, data=data, headers=headers)
cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
opener = urllib.request.build_opener(cookie_processor)
response = opener.open(req, timeout=timeout)
cleanup_func = (lambda r: None)
else:  # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
# default: Retry.DEFAULT = Retry(3)
# (in connectionpool.py in urllib3)
# According to the documentation for urlopen, a redirect counts as a
# retry. So there are 3 redirects max by default.
print("Testing for CAPTCHA python GET request...")
r = requests.get(url)
print("GET successful!")
html = BeautifulSoup(str(r.text), "lxml")
# If there's a captcha and we need to solve it...
if html.body.find('div', attrs={'class': 'g-recaptcha'}):
print("ReCaptcha detected! Trying to bypass it.")
bypass_captcha()
if max_redirects:
retries = urllib3.Retry(3 + max_redirects, redirect=max_redirects)
else:
retries = urllib3.Retry(3)
pool = connection_pool
response = pool.request(method, url, headers=headers,
timeout=timeout, preload_content=False,
decode_content=False, retries=retries)
cleanup_func = (lambda r: r.release_conn())
return response, cleanup_func
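
As a side note on the docstring above, a minimal usage sketch (the URL is a placeholder): passing the same CookieJar object as both cookiejar_send and cookiejar_receive makes the request send the stored cookies and merge any response cookies back into that jar, which also selects the urllib code path rather than the urllib3 pool.

import http.cookiejar

jar = http.cookiejar.CookieJar()
response, cleanup_func = fetch_url_response('https://www.youtube.com/', cookiejar_send=jar, cookiejar_receive=jar)
content = response.read()
cleanup_func(response)  # no-op on the urllib path; release_conn() on the urllib3 path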
def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
cookiejar_send=None, cookiejar_receive=None, use_tor=True,
debug_name=None):
@ -159,18 +229,18 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
url, headers, timeout=timeout,
cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive,
use_tor=use_tor)
print(response)
response_time = time.time()
content = response.read()
read_finish = time.time()
cleanup_func(response) # release_connection for urllib3
if (response.status == 429
and content.startswith(b'<!DOCTYPE')
and b'Our systems have detected unusual traffic' in content):
ip = re.search(br'IP address: ((?:[\da-f]*:)+[\da-f]+|(?:\d+\.)+\d+)',
content)
ip = ip.group(1).decode('ascii') if ip else None
raise FetchError('429', reason=response.reason, ip=ip)
@ -178,12 +248,14 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
raise FetchError(str(response.status), reason=response.reason, ip=None)
if report_text:
print(report_text, ' Latency:', round(response_time - start_time, 3), ' Read time:', round(read_finish - response_time, 3))
content = decode_content(content, response.getheader('Content-Encoding', default='identity'))
return content
def head(url, use_tor=False, report_text=None, max_redirects=10):
pool = connection_pool
start_time = time.time()
# default: Retry.DEFAULT = Retry(3)
@ -191,24 +263,21 @@ def head(url, use_tor=False, report_text=None, max_redirects=10):
# According to the documentation for urlopen, a redirect counts as a retry
# So there are 3 redirects max by default. Let's change that
# to 10 since googlevideo redirects a lot.
retries = urllib3.Retry(3 + max_redirects, redirect=max_redirects, raise_on_redirect=False)
headers = {'User-Agent': 'Python-urllib'}
response = pool.request('HEAD', url, headers=headers, retries=retries)
if report_text:
print(report_text, ' Latency:', round(time.time() - start_time, 3))
return response
mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
mobile_ua = (('User-Agent', mobile_user_agent),)
desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
desktop_ua = (('User-Agent', desktop_user_agent),)
class RateLimitedQueue(gevent.queue.Queue):
''' Does initial_burst (def. 30) at first, then alternates between waiting waiting_period (def. 5) seconds and doing subsequent_bursts (def. 10) queries. After 5 seconds with nothing left in the queue, resets rate limiting. '''
@ -225,9 +294,8 @@ class RateLimitedQueue(gevent.queue.Queue):
self.empty_start = 0
gevent.queue.Queue.__init__(self)
def get(self):
self.lock.acquire()  # blocks if another greenlet currently has the lock
if self.count_since_last_wait >= self.subsequent_bursts and self.surpassed_initial:
gevent.sleep(self.waiting_period)
self.count_since_last_wait = 0
@ -243,7 +311,7 @@ class RateLimitedQueue(gevent.queue.Queue):
self.currently_empty = True
self.empty_start = time.monotonic()
item = gevent.queue.Queue.get(self)  # blocks when nothing left
if self.currently_empty:
if time.monotonic() - self.empty_start >= self.waiting_period:
@ -257,7 +325,6 @@ class RateLimitedQueue(gevent.queue.Queue):
return item
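
A rough usage sketch of this queue (the worker function and video ids are placeholders, and the constructor is assumed to accept the defaults named in the docstring): put() behaves like a normal gevent queue, while each get() sleeps as needed to enforce the burst/wait pattern described above.

def thumbnail_worker(queue):
    while True:
        video_id = queue.get()  # blocks and rate-limits as described above
        download_thumbnail('./thumbnails', video_id)

rate_limited = RateLimitedQueue()
gevent.spawn(thumbnail_worker, rate_limited)
for video_id in ['aaaaaaaaaaa', 'bbbbbbbbbbb']:  # placeholder video ids
    rate_limited.put(video_id)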
def download_thumbnail(save_directory, video_id):
url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
save_location = os.path.join(save_directory, video_id + ".jpg")
@ -269,26 +336,23 @@ def download_thumbnail(save_directory, video_id):
try:
f = open(save_location, 'wb')
except FileNotFoundError:
os.makedirs(save_directory, exist_ok=True)
f = open(save_location, 'wb')
f.write(thumbnail)
f.close()
return True
def download_thumbnails(save_directory, ids):
if not isinstance(ids, (list, tuple)):
ids = list(ids)
# only do 5 at a time
# do the n where n is divisible by 5
i = -1
for i in range(0, int(len(ids) / 5) - 1):
gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i * 5, i * 5 + 5)])
# do the remainders (< 5)
gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i * 5 + 5, len(ids))])
def dict_add(*dicts):
@ -296,6 +360,7 @@ def dict_add(*dicts):
dicts[0].update(dictionary)
return dicts[0]
def video_id(url):
url_parts = urllib.parse.urlparse(url)
return urllib.parse.parse_qs(url_parts.query)['v'][0]
@ -304,11 +369,12 @@ def video_id(url):
# default, sddefault, mqdefault, hqdefault, hq720
def get_thumbnail_url(video_id):
return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
def seconds_to_timestamp(seconds):
seconds = int(seconds)
hours, seconds = divmod(seconds, 3600)
minutes, seconds = divmod(seconds, 60)
if hours != 0:
timestamp = str(hours) + ":"
timestamp += str(minutes).zfill(2) # zfill pads with zeros
@ -319,31 +385,32 @@ def seconds_to_timestamp(seconds):
return timestamp
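
A quick worked example of the divmod arithmetic above (the tail of the function, outside this hunk, is assumed to append the zero-padded seconds in the usual way):

# divmod(3725, 3600) -> (1, 125); divmod(125, 60) -> (2, 5)
# so seconds_to_timestamp(3725) produces a '1:02:05'-style timestamp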
def update_query_string(query_string, items):
parameters = urllib.parse.parse_qs(query_string)
parameters.update(items)
return urllib.parse.urlencode(parameters, doseq=True)
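
A small illustration of the round trip above (query string and values are invented):

# update_query_string('v=abc123&t=10', {'t': '20'})
#   parse_qs              -> {'v': ['abc123'], 't': ['10']}
#   parameters.update     -> {'v': ['abc123'], 't': '20'}
#   urlencode(doseq=True) -> 'v=abc123&t=20'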
def uppercase_escape(s):
return re.sub(
r'\\U([0-9a-fA-F]{8})',
lambda m: chr(int(m.group(1), base=16)), s)
def prefix_url(url):
if url is None:
return None
url = url.lstrip('/')  # some urls have // before them, which has a special meaning
return '/' + url
def left_remove(string, substring):
'''removes substring from the start of string, if present'''
if string.startswith(substring):
return string[len(substring):]
return string
def concat_or_none(*strings):
'''Concatenates strings. Returns None if any of the arguments are None'''
result = ''
@ -365,6 +432,7 @@ def prefix_urls(item):
except KeyError:
pass
def add_extra_html_info(item):
if item['type'] == 'video':
item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None
@ -383,6 +451,7 @@ def add_extra_html_info(item):
elif item['type'] == 'channel':
item['url'] = (URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None
def parse_info_prepare_for_html(renderer, additional_info={}):
item = yt_data_extract.extract_item_info(renderer, additional_info)
prefix_urls(item)
@ -390,8 +459,8 @@ def parse_info_prepare_for_html(renderer, additional_info={}):
return item
def check_gevent_exceptions(*tasks):
for task in tasks:
if task.exception:
raise task.exception

View File

@ -148,7 +148,7 @@ headers = (
def extract_info(video_id, use_invidious, playlist_id=None, index=None):
# bpctr=9999999999 will bypass are-you-sure dialogs for controversial
# videos
url = 'https://m.youtube.com/watch?v=' + video_id + '&gl=US&hl=en&has_verified=1&pbj=1&bpctr=9999999999'
if playlist_id:
url += '&list=' + playlist_id
if index: