diff --git a/app/routes.py b/app/routes.py
index b4c8b73..3ebafe0 100644
--- a/app/routes.py
+++ b/app/routes.py
@@ -429,8 +429,7 @@ def channel(id):
if sort is None:
sort = 3
- data = ytch.get_channel_tab_info(id, page, sort)
-
+ data = ytch.get_channel_tab(id, page, sort)
for video in data['items']:
if config['isInstance']:
hostName = urllib.parse.urlparse(video['thumbnail'][1:]).netloc
diff --git a/yotter-config.json b/yotter-config.json
index 7ae8e0b..19c969f 100644
--- a/yotter-config.json
+++ b/yotter-config.json
@@ -1,7 +1,7 @@
{
"serverName": "yotter.xyz",
- "nitterInstance": "https://nitter.net/",
- "maxInstanceUsers": 120,
+ "nitterInstance": "https://nitter.mastodont.cat/",
+ "maxInstanceUsers": 200,
"serverLocation": "Germany",
"restrictPublicUsage":true,
"isInstance":true,
diff --git a/youtube/channel.py b/youtube/channel.py
index 8c79773..5986e42 100644
--- a/youtube/channel.py
+++ b/youtube/channel.py
@@ -105,25 +105,36 @@ def channel_ctoken_v1(channel_id, page, sort, tab, view=1):
return base64.urlsafe_b64encode(pointless_nest).decode('ascii')
-def get_channel_tab_info(channel_id, page="1", sort=3, tab='videos', view=1, print_status=True):
+def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1,
+ ctoken=None, print_status=True):
message = 'Got channel tab' if print_status else None
- if int(sort) == 2 and int(page) > 1:
- ctoken = channel_ctoken_v1(channel_id, page, sort, tab, view)
- ctoken = ctoken.replace('=', '%3D')
- url = ('https://www.youtube.com/channel/' + channel_id + '/' + tab
- + '?action_continuation=1&continuation=' + ctoken
- + '&pbj=1')
- content = util.fetch_url(url, headers_desktop + real_cookie,
- debug_name='channel_tab', report_text=message)
- else:
+ if not ctoken:
ctoken = channel_ctoken_v3(channel_id, page, sort, tab, view)
ctoken = ctoken.replace('=', '%3D')
- url = 'https://www.youtube.com/browse_ajax?ctoken=' + ctoken
- content = util.fetch_url(url,
- headers_desktop + generic_cookie,
- debug_name='channel_tab', report_text=message)
+ # Not sure what the purpose of the key is or whether it will change.
+ # For now it seems to be constant for the API endpoint and does not
+ # depend on the browsing session or channel.
+ key = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
+ url = 'https://www.youtube.com/youtubei/v1/browse?key=' + key
+
+ data = {
+ 'context': {
+ 'client': {
+ 'hl': 'en',
+ 'gl': 'US',
+ 'clientName': 'WEB',
+ 'clientVersion': '2.20180830',
+ },
+ },
+ 'continuation': ctoken,
+ }
+
+ content_type_header = (('Content-Type', 'application/json'),)
+ content = util.fetch_url(
+ url, headers_desktop + content_type_header,
+ data=json.dumps(data), debug_name='channel_tab', report_text=message)
info = yt_data_extract.extract_channel_info(json.loads(content), tab)
if info['error'] is not None:
return False
@@ -174,12 +185,31 @@ def get_number_of_videos_general(base_url):
return get_number_of_videos_channel(get_channel_id(base_url))
def get_channel_search_json(channel_id, query, page):
- params = proto.string(2, 'search') + proto.string(15, str(page))
+ offset = proto.unpadded_b64encode(proto.uint(3, (page-1)*30))
+ params = proto.string(2, 'search') + proto.string(15, offset)
params = proto.percent_b64encode(params)
ctoken = proto.string(2, channel_id) + proto.string(3, params) + proto.string(11, query)
ctoken = base64.urlsafe_b64encode(proto.nested(80226972, ctoken)).decode('ascii')
- polymer_json = util.fetch_url("https://www.youtube.com/browse_ajax?ctoken=" + ctoken, headers_desktop, debug_name='channel_search')
+ key = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
+ url = 'https://www.youtube.com/youtubei/v1/browse?key=' + key
+
+ data = {
+ 'context': {
+ 'client': {
+ 'hl': 'en',
+ 'gl': 'US',
+ 'clientName': 'WEB',
+ 'clientVersion': '2.20180830',
+ },
+ },
+ 'continuation': ctoken,
+ }
+
+ content_type_header = (('Content-Type', 'application/json'),)
+ polymer_json = util.fetch_url(
+ url, headers_desktop + content_type_header,
+ data=json.dumps(data), debug_name='channel_search')
return polymer_json
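
Both call sites above now issue the same style of InnerTube request. As a reviewer aid, here is a minimal, self-contained sketch of the request shape this hunk produces; the endpoint, API key, and client context are copied from the diff, while the urllib plumbing and the function name are illustrative only (the patched code routes everything through util.fetch_url):

import json
import urllib.request

def browse_continuation(ctoken):
    # Same InnerTube endpoint, key, and client context as in the diff above.
    key = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
    url = 'https://www.youtube.com/youtubei/v1/browse?key=' + key
    body = json.dumps({
        'context': {
            'client': {
                'hl': 'en',
                'gl': 'US',
                'clientName': 'WEB',
                'clientVersion': '2.20180830',
            },
        },
        'continuation': ctoken,
    }).encode('utf-8')
    # Plain urllib is used here only to make the wire format obvious;
    # the patch itself sends the body through util.fetch_url.
    req = urllib.request.Request(
        url, data=body, headers={'Content-Type': 'application/json'})
    with urllib.request.urlopen(req, timeout=15) as response:
        return json.loads(response.read())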
diff --git a/youtube/util.py b/youtube/util.py
index 4df21c1..95e7d22 100644
--- a/youtube/util.py
+++ b/youtube/util.py
@@ -120,9 +120,9 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
if data is not None:
method = "POST"
if isinstance(data, str):
- data = data.encode('ascii')
+ data = data.encode('utf-8')
elif not isinstance(data, bytes):
- data = urllib.parse.urlencode(data).encode('ascii')
+ data = urllib.parse.urlencode(data).encode('utf-8')
if cookiejar_send is not None or cookiejar_receive is not None: # Use urllib
req = urllib.request.Request(url, data=data, headers=headers)
@@ -143,7 +143,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
else:
retries = urllib3.Retry(3)
pool = get_pool(use_tor)
- response = pool.request(method, url, headers=headers,
+ response = pool.request(method, url, headers=headers, body=data,
timeout=timeout, preload_content=False,
decode_content=False, retries=retries)
cleanup_func = (lambda r: r.release_conn())
@@ -156,7 +156,7 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
start_time = time.time()
response, cleanup_func = fetch_url_response(
- url, headers, timeout=timeout,
+ url, headers, timeout=timeout, data=data,
cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive,
use_tor=use_tor)
response_time = time.time()
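
With data now threaded from fetch_url into fetch_url_response, a string body is sent as UTF-8, a dict is urlencoded, and the urllib3 path finally passes the body along. A hedged usage sketch of the widened signature; the URL, key, and token are placeholders, and the import path is assumed from the repository layout:

import json
from youtube import util

payload = json.dumps({'continuation': 'PLACEHOLDER_TOKEN'})
content = util.fetch_url(
    'https://www.youtube.com/youtubei/v1/browse?key=PLACEHOLDER_KEY',
    headers=(('Content-Type', 'application/json'),),
    data=payload,  # str -> encoded as UTF-8; the request becomes a POST
    debug_name='example_browse')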
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 2d3b637..e234bc7 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -290,7 +290,7 @@ def extract_item_info(item, additional_info={}):
info['duration'] = extract_str(item.get('lengthText'))
# if it's an item in a playlist, get its index
- if 'index' in item: # url has wrong index on playlist page
+ if 'index' in item: # url has wrong index on playlist page
info['index'] = extract_int(item.get('index'))
elif 'indexText' in item:
# Current item in playlist has ▶ instead of the actual index, must
@@ -329,6 +329,11 @@ def extract_item_info(item, additional_info={}):
def extract_response(polymer_json):
'''return response, error'''
+ # /youtubei/v1/browse endpoint returns response directly
+ if isinstance(polymer_json, dict) and 'responseContext' in polymer_json:
+ # this is the response
+ return polymer_json, None
+
response = multi_deep_get(polymer_json, [1, 'response'], ['response'])
if response is None:
return None, 'Failed to extract response'
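
The old browse_ajax endpoint wrapped the useful object in an envelope, while youtubei/v1/browse returns it directly; the added branch keys off 'responseContext' to tell them apart. A small shape-only illustration (the literals are made up):

# Envelope shape from the old /browse_ajax endpoint: the useful object sits
# at [1]['response'], which multi_deep_get digs out below.
old_style = [{'page': 'browse'}, {'response': {'contents': {}}}]

# Direct shape from /youtubei/v1/browse: the object itself carries
# 'responseContext', so the new early return hands it back unchanged.
new_style = {'responseContext': {}, 'contents': {}}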
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 340a367..db53581 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -172,14 +172,13 @@ def _extract_watch_info_mobile(top_level):
else:
info['playlist'] = {}
info['playlist']['title'] = playlist.get('title')
- info['playlist']['author'] = extract_str(multi_get(playlist,
+ info['playlist']['author'] = extract_str(multi_get(playlist,
'ownerName', 'longBylineText', 'shortBylineText', 'ownerText'))
author_id = deep_get(playlist, 'longBylineText', 'runs', 0,
'navigationEndpoint', 'browseEndpoint', 'browseId')
info['playlist']['author_id'] = author_id
- if author_id:
- info['playlist']['author_url'] = concat_or_none(
- 'https://www.youtube.com/channel/', author_id)
+ info['playlist']['author_url'] = concat_or_none(
+ 'https://www.youtube.com/channel/', author_id)
info['playlist']['id'] = playlist.get('playlistId')
info['playlist']['url'] = concat_or_none(
'https://www.youtube.com/playlist?list=',
@@ -447,7 +446,8 @@ def _extract_playability_error(info, player_response, error_prefix=''):
SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
def extract_watch_info(polymer_json):
- info = {'playability_error': None, 'error': None}
+ info = {'playability_error': None, 'error': None,
+ 'player_urls_missing': None}
if isinstance(polymer_json, dict):
top_level = polymer_json
@@ -509,6 +509,10 @@ def extract_watch_info(polymer_json):
if not info['formats']:
_extract_formats(info, player_response)
+ # see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
+ info['player_urls_missing'] = (
+ not info['formats'] and not embedded_player_response)
+
# playability errors
_extract_playability_error(info, player_response)
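
player_urls_missing is only computed here; acting on it is left to the caller. A hedged sketch of one possible consumer (the embed-page retry and the fetch_embed_page callable are assumptions for illustration, not part of this patch):

def watch_info_with_fallback(polymer_json, fetch_embed_page):
    # fetch_embed_page is a hypothetical callable returning the parsed
    # /embed page data; this patch only adds the flag itself.
    info = extract_watch_info(polymer_json)
    if info['player_urls_missing']:
        # No format URLs and no embedded player response in the watch page,
        # so retry extraction against another source before giving up.
        info = extract_watch_info(fetch_embed_page())
    return info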
@@ -565,6 +569,84 @@ def extract_watch_info(polymer_json):
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
return info
+single_char_codes = {
+ '\\': '\\',
+ '"': '"',
+ "'": "'",
+ 'b': '\b',
+ 'f': '\f',
+ 'n': '\n',
+ 'r': '\r',
+ 't': '\t',
+ 'v': '\x0b',
+ '0': '\x00',
+ '\n': '', # backslash followed by literal newline joins lines
+}
+def js_escape_replace(match):
+ r'''Resolves javascript string escape sequences such as \x..'''
+ # some js-strings in the watch page html include them for no reason
+ # https://mathiasbynens.be/notes/javascript-escapes
+ escaped_sequence = match.group(1)
+ if escaped_sequence[0] in ('x', 'u'):
+ return chr(int(escaped_sequence[1:], base=16))
+
+ # In javascript, if it's not one of those escape codes, it's just the
+ # literal character. e.g., "\a" = "a"
+ return single_char_codes.get(escaped_sequence, escaped_sequence)
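+
+ # Hedged usage sketch (not part of the patch): js_escape_replace is written
+ # as an re.sub callback whose pattern captures whatever follows the
+ # backslash. The pattern below is an assumption for illustration; the
+ # extractor's real pattern is defined further down in this module.
+ #   js_escape_re = re.compile(r'\\(x..|u....|.)', re.DOTALL)
+ #   js_escape_re.sub(js_escape_replace, r'It\x27s \u0061 test')
+ #   # -> "It's a test"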
+
+# works but complicated and unsafe:
+#PLAYER_RESPONSE_RE = re.compile(r'')
+INITIAL_DATA_RE = re.compile(r"