From d0aa476f70b0a10a886589176ab09e1e26ecadaa Mon Sep 17 00:00:00 2001
From: pluja <github_pluja@r3d.red>
Date: Fri, 12 Mar 2021 19:50:13 +0100
Subject: [PATCH] Fix #195 and #193

---
 app/routes.py                               |   3 +-
 yotter-config.json                          |   4 +-
 youtube/channel.py                          |  62 ++++++++---
 youtube/util.py                             |   8 +-
 youtube/yt_data_extract/common.py           |   7 +-
 youtube/yt_data_extract/watch_extraction.py | 112 +++++++++++++++++---
 6 files changed, 158 insertions(+), 38 deletions(-)

diff --git a/app/routes.py b/app/routes.py
index b4c8b73..3ebafe0 100644
--- a/app/routes.py
+++ b/app/routes.py
@@ -429,8 +429,7 @@ def channel(id):
     if sort is None:
         sort = 3
 
-    data = ytch.get_channel_tab_info(id, page, sort)
-
+    data = ytch.get_channel_tab(id, page, sort)
     for video in data['items']:
         if config['isInstance']:
             hostName = urllib.parse.urlparse(video['thumbnail'][1:]).netloc
diff --git a/yotter-config.json b/yotter-config.json
index 7ae8e0b..19c969f 100644
--- a/yotter-config.json
+++ b/yotter-config.json
@@ -1,7 +1,7 @@
 {
 	"serverName": "yotter.xyz",
-	"nitterInstance": "https://nitter.net/",
-	"maxInstanceUsers": 120,
+	"nitterInstance": "https://nitter.mastodont.cat/",
+	"maxInstanceUsers": 200,
 	"serverLocation": "Germany",
 	"restrictPublicUsage":true,
 	"isInstance":true,
diff --git a/youtube/channel.py b/youtube/channel.py
index 8c79773..5986e42 100644
--- a/youtube/channel.py
+++ b/youtube/channel.py
@@ -105,25 +105,36 @@ def channel_ctoken_v1(channel_id, page, sort, tab, view=1):
 
     return base64.urlsafe_b64encode(pointless_nest).decode('ascii')
 
-def get_channel_tab_info(channel_id, page="1", sort=3, tab='videos', view=1, print_status=True):
+def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1,
+                    ctoken=None, print_status=True):
     message = 'Got channel tab' if print_status else None
 
-    if int(sort) == 2 and int(page) > 1:
-        ctoken = channel_ctoken_v1(channel_id, page, sort, tab, view)
-        ctoken = ctoken.replace('=', '%3D')
-        url = ('https://www.youtube.com/channel/' + channel_id + '/' + tab
-            + '?action_continuation=1&continuation=' + ctoken
-            + '&pbj=1')
-        content = util.fetch_url(url, headers_desktop + real_cookie,
-            debug_name='channel_tab', report_text=message)
-    else:
+    if not ctoken:
         ctoken = channel_ctoken_v3(channel_id, page, sort, tab, view)
         ctoken = ctoken.replace('=', '%3D')
-        url = 'https://www.youtube.com/browse_ajax?ctoken=' + ctoken
-        content = util.fetch_url(url,
-            headers_desktop + generic_cookie,
-            debug_name='channel_tab', report_text=message)
 
+    # Not sure what the purpose of the key is or whether it will change
+    # For now it seems to be constant for the API endpoint, not dependent
+    # on the browsing session or channel
+    key = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
+    url = 'https://www.youtube.com/youtubei/v1/browse?key=' + key
+
+    data = {
+        'context': {
+            'client': {
+                'hl': 'en',
+                'gl': 'US',
+                'clientName': 'WEB',
+                'clientVersion': '2.20180830',
+            },
+        },
+        'continuation': ctoken,
+    }
+
+    content_type_header = (('Content-Type', 'application/json'),)
+    content = util.fetch_url(
+        url, headers_desktop + content_type_header,
+        data=json.dumps(data), debug_name='channel_tab', report_text=message)
     info = yt_data_extract.extract_channel_info(json.loads(content), tab)
     if info['error'] is not None:
         return False
@@ -174,12 +185,31 @@ def get_number_of_videos_general(base_url):
     return get_number_of_videos_channel(get_channel_id(base_url))
 
 def get_channel_search_json(channel_id, query, page):
-    params = proto.string(2, 'search') + proto.string(15, str(page))
+    offset = proto.unpadded_b64encode(proto.uint(3, (page-1)*30))
+    params = proto.string(2, 'search') + proto.string(15, offset)
     params = proto.percent_b64encode(params)
     ctoken = proto.string(2, channel_id) + proto.string(3, params) + proto.string(11, query)
     ctoken = base64.urlsafe_b64encode(proto.nested(80226972, ctoken)).decode('ascii')
 
-    polymer_json = util.fetch_url("https://www.youtube.com/browse_ajax?ctoken=" + ctoken, headers_desktop, debug_name='channel_search')
+    key = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
+    url = 'https://www.youtube.com/youtubei/v1/browse?key=' + key
+
+    data = {
+        'context': {
+            'client': {
+                'hl': 'en',
+                'gl': 'US',
+                'clientName': 'WEB',
+                'clientVersion': '2.20180830',
+            },
+        },
+        'continuation': ctoken,
+    }
+
+    content_type_header = (('Content-Type', 'application/json'),)
+    polymer_json = util.fetch_url(
+        url, headers_desktop + content_type_header,
+        data=json.dumps(data), debug_name='channel_search')
 
     return polymer_json
 
diff --git a/youtube/util.py b/youtube/util.py
index 4df21c1..95e7d22 100644
--- a/youtube/util.py
+++ b/youtube/util.py
@@ -120,9 +120,9 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
     if data is not None:
         method = "POST"
         if isinstance(data, str):
-            data = data.encode('ascii')
+            data = data.encode('utf-8')
         elif not isinstance(data, bytes):
-            data = urllib.parse.urlencode(data).encode('ascii')
+            data = urllib.parse.urlencode(data).encode('utf-8')
 
     if cookiejar_send is not None or cookiejar_receive is not None:     # Use urllib
         req = urllib.request.Request(url, data=data, headers=headers)
@@ -143,7 +143,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
         else:
             retries = urllib3.Retry(3)
         pool = get_pool(use_tor)
-        response = pool.request(method, url, headers=headers,
+        response = pool.request(method, url, headers=headers, body=data,
                                 timeout=timeout, preload_content=False,
                                 decode_content=False, retries=retries)
         cleanup_func = (lambda r: r.release_conn())
@@ -156,7 +156,7 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
     start_time = time.time()
 
     response, cleanup_func = fetch_url_response(
-        url, headers, timeout=timeout,
+        url, headers, timeout=timeout, data=data,
         cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive,
         use_tor=use_tor)
     response_time = time.time()
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 2d3b637..e234bc7 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -290,7 +290,7 @@ def extract_item_info(item, additional_info={}):
         info['duration'] = extract_str(item.get('lengthText'))
 
         # if it's an item in a playlist, get its index
-        if 'index' in item: # url has wrong index on playlist page 
+        if 'index' in item: # url has wrong index on playlist page
             info['index'] = extract_int(item.get('index'))
         elif 'indexText' in item:
             # Current item in playlist has ▶ instead of the actual index, must
@@ -329,6 +329,11 @@ def extract_item_info(item, additional_info={}):
 
 def extract_response(polymer_json):
     '''return response, error'''
+    # /youtubei/v1/browse endpoint returns response directly
+    if isinstance(polymer_json, dict) and 'responseContext' in polymer_json:
+        # this is the response
+        return polymer_json, None
+
     response = multi_deep_get(polymer_json, [1, 'response'], ['response'])
     if response is None:
         return None, 'Failed to extract response'
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 340a367..db53581 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -172,14 +172,13 @@ def _extract_watch_info_mobile(top_level):
         else:
             info['playlist'] = {}
             info['playlist']['title'] = playlist.get('title')
-            info['playlist']['author'] = extract_str(multi_get(playlist, 
+            info['playlist']['author'] = extract_str(multi_get(playlist,
                 'ownerName', 'longBylineText', 'shortBylineText', 'ownerText'))
             author_id = deep_get(playlist, 'longBylineText', 'runs', 0,
                 'navigationEndpoint', 'browseEndpoint', 'browseId')
             info['playlist']['author_id'] = author_id
-            if author_id:
-                info['playlist']['author_url'] = concat_or_none(
-                    'https://www.youtube.com/channel/', author_id)
+            info['playlist']['author_url'] = concat_or_none(
+                'https://www.youtube.com/channel/', author_id)
             info['playlist']['id'] = playlist.get('playlistId')
             info['playlist']['url'] = concat_or_none(
                 'https://www.youtube.com/playlist?list=',
@@ -447,7 +446,8 @@ def _extract_playability_error(info, player_response, error_prefix=''):
 
 SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
 def extract_watch_info(polymer_json):
-    info = {'playability_error': None, 'error': None}
+    info = {'playability_error': None, 'error': None,
+        'player_response_missing': None}
 
     if isinstance(polymer_json, dict):
         top_level = polymer_json
@@ -509,6 +509,10 @@ def extract_watch_info(polymer_json):
     if not info['formats']:
         _extract_formats(info, player_response)
 
+    # see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
+    info['player_urls_missing'] = (
+        not info['formats'] and not embedded_player_response)
+
     # playability errors
     _extract_playability_error(info, player_response)
 
@@ -565,6 +569,84 @@ def extract_watch_info(polymer_json):
     info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
     return info
 
+single_char_codes = {
+    'n': '\n',
+    '\\': '\\',
+    '"': '"',
+    "'": "'",
+    'b': '\b',
+    'f': '\f',
+    'n': '\n',
+    'r': '\r',
+    't': '\t',
+    'v': '\x0b',
+    '0': '\x00',
+    '\n': '', # backslash followed by literal newline joins lines
+}
+def js_escape_replace(match):
+    r'''Resolves javascript string escape sequences such as \x..'''
+    # some js-strings in the watch page html include them for no reason
+    # https://mathiasbynens.be/notes/javascript-escapes
+    escaped_sequence = match.group(1)
+    if escaped_sequence[0] in ('x', 'u'):
+        return chr(int(escaped_sequence[1:], base=16))
+
+    # In javascript, if it's not one of those escape codes, it's just the
+    # literal character. e.g., "\a" = "a"
+    return single_char_codes.get(escaped_sequence, escaped_sequence)
+
+# works but complicated and unsafe:
+#PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>[^<]*?var ytInitialPlayerResponse = ({(?:"(?:[^"\\]|\\.)*?"|[^"])+?});')
+
+# Because there are sometimes additional statements after the json object
+# so we just capture all of those until end of script and tell json decoder
+# to ignore extra stuff after the json object
+PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>[^<]*?var ytInitialPlayerResponse = ({.*?)</script>')
+INITIAL_DATA_RE = re.compile(r"<script[^>]*?>var ytInitialData = '(.+?[^\\])';")
+BASE_JS_RE = re.compile(r'jsUrl":\s*"([\w\-\./]+?/base.js)"')
+JS_STRING_ESCAPE_RE = re.compile(r'\\([^xu]|x..|u....)')
+def extract_watch_info_from_html(watch_html):
+    base_js_match = BASE_JS_RE.search(watch_html)
+    player_response_match = PLAYER_RESPONSE_RE.search(watch_html)
+    initial_data_match = INITIAL_DATA_RE.search(watch_html)
+
+    if base_js_match is not None:
+        base_js_url = base_js_match.group(1)
+    else:
+        base_js_url = None
+
+    if player_response_match is not None:
+        decoder = json.JSONDecoder()
+        # this will make it ignore extra stuff after end of object
+        player_response = decoder.raw_decode(player_response_match.group(1))[0]
+    else:
+        return {'error': 'Could not find ytInitialPlayerResponse'}
+        player_response = None
+
+    if initial_data_match is not None:
+        initial_data = initial_data_match.group(1)
+        initial_data = JS_STRING_ESCAPE_RE.sub(js_escape_replace, initial_data)
+        initial_data = json.loads(initial_data)
+    else:
+        print('extract_watch_info_from_html: failed to find initialData')
+        initial_data = None
+
+    # imitate old format expected by extract_watch_info
+    fake_polymer_json = {
+        'player': {
+            'args': {},
+            'assets': {
+                'js': base_js_url
+            }
+        },
+        'playerResponse': player_response,
+        'response': initial_data,
+    }
+
+    return extract_watch_info(fake_polymer_json)
+
+
+
 def get_caption_url(info, language, format, automatic=False, translation_language=None):
     '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.'''
     url = info['_captions_base_url']
@@ -580,7 +662,8 @@ def get_caption_url(info, language, format, automatic=False, translation_languag
     return url
 
 def update_with_age_restricted_info(info, video_info_page):
-    ERROR_PREFIX = 'Error bypassing age-restriction: '
+    '''Inserts urls from 'player_response' in get_video_info page'''
+    ERROR_PREFIX = 'Error getting missing player or bypassing age-restriction: '
 
     video_info = urllib.parse.parse_qs(video_info_page)
     player_response = deep_get(video_info, 'player_response', 0)
@@ -603,7 +686,9 @@ def requires_decryption(info):
 # adapted from youtube-dl and invidious:
 # https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr
 decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}{]+)return a\.join\(""\)\}')
-op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)')
+# gives us e.g. rt, .xK, 5 from rt.xK(a,5) or rt, ["xK"], 5 from rt["xK"](a,5)
+# (var, operation, argument)
+var_op_arg_re = re.compile(r'(\w+)(\.\w+|\["[^"]+"\])\(a,(\d+)\)')
 def extract_decryption_function(info, base_js):
     '''Insert decryption function into info. Return error string if not successful.
     Decryption function is a list of list[2] of numbers.
@@ -617,10 +702,11 @@ def extract_decryption_function(info, base_js):
     if not function_body:
         return 'Empty decryption function body'
 
-    var_name = get(function_body[0].split('.'), 0)
-    if var_name is None:
+    var_with_operation_match = var_op_arg_re.fullmatch(function_body[0])
+    if var_with_operation_match is None:
         return 'Could not find var_name'
 
+    var_name = var_with_operation_match.group(1)
     var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, flags=re.DOTALL)
     if var_body_match is None:
         return 'Could not find var_body'
@@ -649,13 +735,13 @@ def extract_decryption_function(info, base_js):
 
     decryption_function = []
     for op_with_arg in function_body:
-        match = op_with_arg_re.fullmatch(op_with_arg)
+        match = var_op_arg_re.fullmatch(op_with_arg)
         if match is None:
             return 'Could not parse operation with arg'
-        op_name = match.group(1)
+        op_name = match.group(2).strip('[].')
         if op_name not in operation_definitions:
-            return 'Unknown op_name: ' + op_name
-        op_argument = match.group(2)
+            return 'Unknown op_name: ' + str(op_name)
+        op_argument = match.group(3)
         decryption_function.append([operation_definitions[op_name], int(op_argument)])
 
     info['decryption_function'] = decryption_function