First update, use youtube-local adapted 'API'
This commit is contained in:
parent
53586d0195
commit
8efae82302
575
app/routes.py
575
app/routes.py
File diff suppressed because it is too large
Load Diff
@ -1,7 +1,7 @@
|
||||
<div class="comment">
|
||||
<a class="avatar" style="width: 32px; height: 32px;"><img src="{{ comment.thumbnail }}"></a>
|
||||
<div class="content">
|
||||
{% if comment.authorIsChannelOwner %}
|
||||
{% if comment.author == info.author %}
|
||||
|
||||
<a class="author" style="color: red;" href="{{comment.channel}}"><i class="red user circle icon"></i>{{comment.author}}</a>
|
||||
{% else %}
|
||||
@ -22,9 +22,6 @@
|
||||
<i class="thumbs up icon"></i>
|
||||
{{comment.likes}}
|
||||
</div>
|
||||
{%if comment.creatorHeart != false%}
|
||||
<i class="small red heart icon"></i><img class="ui circular image" style="width: 15px; height: 15px;" src="{{comment.creatorHeart}}">
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
@ -4,30 +4,30 @@
|
||||
{% extends "base.html" %}
|
||||
{% block content %}
|
||||
<div class="ui text container">
|
||||
{% if video.nginxUrl == "#" %}
|
||||
{% if info.error != None or info.playability_error != None %}
|
||||
<div class="ui center aligned text container">
|
||||
<div class="ui segment">
|
||||
<h4 class="ui header">ERROR WITH VIDEO</h4>
|
||||
</div>
|
||||
</div>
|
||||
{% elif video.isUpcoming %}
|
||||
{% elif info.playability_status != None %}
|
||||
<div class="ui center aligned text container">
|
||||
<div class="ui segment">
|
||||
<h4 class="ui header">SCHEDULED VIDEO</h4>
|
||||
<h5 class="ui header">{{video.premieres}}</h5>
|
||||
</div>
|
||||
</div>
|
||||
{% elif video.isLive %}
|
||||
{% elif info.live %}
|
||||
<div class="video-js-responsive-container vjs-hd">
|
||||
<video-js id=live width="1080" class="video-js vjs-default-skin" controls buffered>
|
||||
<video-js id=live width="1080" class="video-js vjs-default-skin" controls>
|
||||
<source
|
||||
src="{{urls[0]['url']}}"
|
||||
src="#"
|
||||
type="application/x-mpegURL">
|
||||
</video-js>
|
||||
</div>
|
||||
<div class="ui center aligned text container">
|
||||
<div class="ui segment">
|
||||
<h3 class="ui header">LIVESTREAM VIDEO</h3>
|
||||
<h3 class="ui header"><i class="red small circle icon"></i> LIVESTREAM VIDEO</h3>
|
||||
<h4 class="ui header">FEATURE AVAILABLE SOON</h4>
|
||||
<h5 class="ui header">Livestreams are under developent and still not supported on Yotter.</h5>
|
||||
</div>
|
||||
@ -41,11 +41,11 @@
|
||||
buffered
|
||||
preload="none">
|
||||
{% if config.nginxVideoStream %}
|
||||
{% for url in urls %}
|
||||
<source src="{{url.url}}" type="video/{{url.ext}}">
|
||||
{% for format in info.formats %}
|
||||
{% if format.video_valid %}
|
||||
<source src="{{format.url}}" type="video/{{format.ext}}">
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<source src="{{url_for('stream', url=video.videoUrl.replace('/', 'YotterSlash'))}}" type="video/mp4">
|
||||
{% endif %}
|
||||
</video>
|
||||
</div>
|
||||
@ -53,55 +53,54 @@
|
||||
|
||||
<div class="ui segments">
|
||||
<div class="ui segment">
|
||||
<h2 class="ui header break-word">{{video.title}}</h2>
|
||||
<h2 class="ui header break-word">{{info.title}}</h2>
|
||||
</div>
|
||||
<div class="ui horizontal segments">
|
||||
<div class="center aligned ui segment">
|
||||
<a href="{{ url_for('channel', id=video.channelId)}}">
|
||||
{%if video.author.__len__() > 8%}
|
||||
<i class="user icon"></i> {{video.author[0:10]+'...'}}
|
||||
{%else%}
|
||||
<i class="user icon"></i> {{video.author}}
|
||||
{%endif%}
|
||||
<a href="{{ url_for('channel', id=info.author_id)}}">
|
||||
<i class="user icon"></i> {{info.author}}
|
||||
</a>
|
||||
</div>
|
||||
<div class="center aligned ui segment">
|
||||
<h4 class="ui header"><i class="grey eye icon"></i>{{video.viewCount}}</h4>
|
||||
<h4 class="ui header"><i class="grey eye icon"></i>{{info.view_count}}</h4>
|
||||
</div>
|
||||
<div class="center aligned ui segment">
|
||||
{% if video.averageRating | int > 49 %}
|
||||
<h4 class="ui header"><i class="green thumbs up icon"></i> {{video.averageRating[0:4]}}%</h4>
|
||||
{% if info.rating | int > 49 %}
|
||||
<h4 class="ui header"><i class="green thumbs up icon"></i> {{info.rating}}%</h4>
|
||||
{% else %}
|
||||
<h4 class="ui header"><i class="red thumbs down icon"></i> {{video.averageRating[0:4]}}%</h4>
|
||||
<h4 class="ui header"><i class="red thumbs down icon"></i> {{info.rating}}%</h4>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="ui raised center aligned segment break-word">
|
||||
<p><i class="grey music icon"></i><b><a href="{{video.nginxAudioUrl}}">Play Only Audio</a></b></p>
|
||||
<p><i class="grey music icon"></i><b>Audio Only</b></p>
|
||||
<audio controls>
|
||||
<source src="{{video.nginxAudioUrl}}">
|
||||
Your browser does not support the audio element.
|
||||
{% for format in info.formats %}
|
||||
{% if format.audio_valid %}
|
||||
<source src="{{format.url}}">
|
||||
{%endif%}
|
||||
{%endfor%}
|
||||
No audio available.
|
||||
</audio>
|
||||
</div>
|
||||
|
||||
<div class="ui raised segment break-word">
|
||||
<p>{{video.description}}</p>
|
||||
<p>{{info.description}}</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% if comments != False %}
|
||||
<div class="ui comments">
|
||||
<h3 class="ui dividing header">Comments</h3>
|
||||
{% for comment in video.comments %}
|
||||
{% for comment in videocomments %}
|
||||
{% include '_video_comment.html' %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
{%endif%}
|
||||
|
||||
<script src="{{ url_for('static',filename='video.min.js') }}"></script>
|
||||
<script src="{{ url_for('static',filename='videojs-http-streaming.min.js')}}"></script>
|
||||
{% if video.isLive %}
|
||||
{% if info.live %}
|
||||
<p>Active</p>
|
||||
<script src="{{ url_for('static',filename='videojs-http-streaming.min.js')}}"></script>
|
||||
<script>
|
||||
var player = videojs('live');
|
||||
player.play();
|
||||
|
281
youtube/channel.py
Normal file
281
youtube/channel.py
Normal file
@ -0,0 +1,281 @@
|
||||
import base64
|
||||
from youtube import util, yt_data_extract, local_playlist, subscriptions
|
||||
from youtube import yt_app
|
||||
|
||||
import urllib
|
||||
import json
|
||||
from string import Template
|
||||
import youtube.proto as proto
|
||||
import html
|
||||
import math
|
||||
import gevent
|
||||
import re
|
||||
import cachetools.func
|
||||
import traceback
|
||||
|
||||
import flask
|
||||
from flask import request
|
||||
|
||||
headers_desktop = (
|
||||
('Accept', '*/*'),
|
||||
('Accept-Language', 'en-US,en;q=0.5'),
|
||||
('X-YouTube-Client-Name', '1'),
|
||||
('X-YouTube-Client-Version', '2.20180830'),
|
||||
) + util.desktop_ua
|
||||
headers_mobile = (
|
||||
('Accept', '*/*'),
|
||||
('Accept-Language', 'en-US,en;q=0.5'),
|
||||
('X-YouTube-Client-Name', '2'),
|
||||
('X-YouTube-Client-Version', '2.20180830'),
|
||||
) + util.mobile_ua
|
||||
real_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=8XihrAcN1l4'),)
|
||||
generic_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=ST1Ti53r4fU'),)
|
||||
|
||||
# SORT:
|
||||
# videos:
|
||||
# Popular - 1
|
||||
# Oldest - 2
|
||||
# Newest - 3
|
||||
# playlists:
|
||||
# Oldest - 2
|
||||
# Newest - 3
|
||||
# Last video added - 4
|
||||
|
||||
# view:
|
||||
# grid: 0 or 1
|
||||
# list: 2
|
||||
def channel_ctoken_v3(channel_id, page, sort, tab, view=1):
|
||||
# page > 1 doesn't work when sorting by oldest
|
||||
offset = 30*(int(page) - 1)
|
||||
page_token = proto.string(61, proto.unpadded_b64encode(
|
||||
proto.string(1, proto.unpadded_b64encode(proto.uint(1,offset)))
|
||||
))
|
||||
|
||||
tab = proto.string(2, tab )
|
||||
sort = proto.uint(3, int(sort))
|
||||
|
||||
shelf_view = proto.uint(4, 0)
|
||||
view = proto.uint(6, int(view))
|
||||
continuation_info = proto.string(3,
|
||||
proto.percent_b64encode(tab + sort + shelf_view + view + page_token)
|
||||
)
|
||||
|
||||
channel_id = proto.string(2, channel_id )
|
||||
pointless_nest = proto.string(80226972, channel_id + continuation_info)
|
||||
|
||||
return base64.urlsafe_b64encode(pointless_nest).decode('ascii')
|
||||
|
||||
def channel_ctoken_v2(channel_id, page, sort, tab, view=1):
|
||||
# see https://github.com/iv-org/invidious/issues/1319#issuecomment-671732646
|
||||
# page > 1 doesn't work when sorting by oldest
|
||||
offset = 30*(int(page) - 1)
|
||||
schema_number = {
|
||||
3: 6307666885028338688,
|
||||
2: 17254859483345278706,
|
||||
1: 16570086088270825023,
|
||||
}[int(sort)]
|
||||
page_token = proto.string(61, proto.unpadded_b64encode(proto.string(1,
|
||||
proto.uint(1, schema_number) + proto.string(2,
|
||||
proto.string(1, proto.unpadded_b64encode(proto.uint(1,offset)))
|
||||
)
|
||||
)))
|
||||
|
||||
tab = proto.string(2, tab )
|
||||
sort = proto.uint(3, int(sort))
|
||||
#page = proto.string(15, str(page) )
|
||||
|
||||
shelf_view = proto.uint(4, 0)
|
||||
view = proto.uint(6, int(view))
|
||||
continuation_info = proto.string(3,
|
||||
proto.percent_b64encode(tab + sort + shelf_view + view + page_token)
|
||||
)
|
||||
|
||||
channel_id = proto.string(2, channel_id )
|
||||
pointless_nest = proto.string(80226972, channel_id + continuation_info)
|
||||
|
||||
return base64.urlsafe_b64encode(pointless_nest).decode('ascii')
|
||||
|
||||
def channel_ctoken_v1(channel_id, page, sort, tab, view=1):
|
||||
tab = proto.string(2, tab )
|
||||
sort = proto.uint(3, int(sort))
|
||||
page = proto.string(15, str(page) )
|
||||
# example with shelves in videos tab: https://www.youtube.com/channel/UCNL1ZadSjHpjm4q9j2sVtOA/videos
|
||||
shelf_view = proto.uint(4, 0)
|
||||
view = proto.uint(6, int(view))
|
||||
continuation_info = proto.string(3, proto.percent_b64encode(tab + view + sort + shelf_view + page + proto.uint(23, 0)) )
|
||||
|
||||
channel_id = proto.string(2, channel_id )
|
||||
pointless_nest = proto.string(80226972, channel_id + continuation_info)
|
||||
|
||||
return base64.urlsafe_b64encode(pointless_nest).decode('ascii')
|
||||
|
||||
def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1, print_status=True):
|
||||
message = 'Got channel tab' if print_status else None
|
||||
|
||||
if int(sort) == 2 and int(page) > 1:
|
||||
ctoken = channel_ctoken_v1(channel_id, page, sort, tab, view)
|
||||
ctoken = ctoken.replace('=', '%3D')
|
||||
url = ('https://www.youtube.com/channel/' + channel_id + '/' + tab
|
||||
+ '?action_continuation=1&continuation=' + ctoken
|
||||
+ '&pbj=1')
|
||||
content = util.fetch_url(url, headers_desktop + real_cookie,
|
||||
debug_name='channel_tab', report_text=message)
|
||||
else:
|
||||
ctoken = channel_ctoken_v3(channel_id, page, sort, tab, view)
|
||||
ctoken = ctoken.replace('=', '%3D')
|
||||
url = 'https://www.youtube.com/browse_ajax?ctoken=' + ctoken
|
||||
content = util.fetch_url(url,
|
||||
headers_desktop + generic_cookie,
|
||||
debug_name='channel_tab', report_text=message)
|
||||
|
||||
return content
|
||||
|
||||
# cache entries expire after 30 minutes
|
||||
@cachetools.func.ttl_cache(maxsize=128, ttl=30*60)
|
||||
def get_number_of_videos_channel(channel_id):
|
||||
if channel_id is None:
|
||||
return 1000
|
||||
|
||||
# Uploads playlist
|
||||
playlist_id = 'UU' + channel_id[2:]
|
||||
url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&pbj=1'
|
||||
|
||||
try:
|
||||
response = util.fetch_url(url, headers_mobile,
|
||||
debug_name='number_of_videos', report_text='Got number of videos')
|
||||
except urllib.error.HTTPError as e:
|
||||
traceback.print_exc()
|
||||
print("Couldn't retrieve number of videos")
|
||||
return 1000
|
||||
|
||||
response = response.decode('utf-8')
|
||||
|
||||
# match = re.search(r'"numVideosText":\s*{\s*"runs":\s*\[{"text":\s*"([\d,]*) videos"', response)
|
||||
match = re.search(r'"numVideosText".*?([,\d]+)', response)
|
||||
if match:
|
||||
return int(match.group(1).replace(',',''))
|
||||
else:
|
||||
return 0
|
||||
|
||||
channel_id_re = re.compile(r'videos\.xml\?channel_id=([a-zA-Z0-9_-]{24})"')
|
||||
@cachetools.func.lru_cache(maxsize=128)
|
||||
def get_channel_id(base_url):
|
||||
# method that gives the smallest possible response at ~4 kb
|
||||
# needs to be as fast as possible
|
||||
base_url = base_url.replace('https://www', 'https://m') # avoid redirect
|
||||
response = util.fetch_url(base_url + '/about?pbj=1', headers_mobile,
|
||||
debug_name='get_channel_id', report_text='Got channel id').decode('utf-8')
|
||||
match = channel_id_re.search(response)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return None
|
||||
|
||||
def get_number_of_videos_general(base_url):
|
||||
return get_number_of_videos_channel(get_channel_id(base_url))
|
||||
|
||||
def get_channel_search_json(channel_id, query, page):
|
||||
params = proto.string(2, 'search') + proto.string(15, str(page))
|
||||
params = proto.percent_b64encode(params)
|
||||
ctoken = proto.string(2, channel_id) + proto.string(3, params) + proto.string(11, query)
|
||||
ctoken = base64.urlsafe_b64encode(proto.nested(80226972, ctoken)).decode('ascii')
|
||||
|
||||
polymer_json = util.fetch_url("https://www.youtube.com/browse_ajax?ctoken=" + ctoken, headers_desktop, debug_name='channel_search')
|
||||
|
||||
return polymer_json
|
||||
|
||||
|
||||
def post_process_channel_info(info):
|
||||
info['avatar'] = util.prefix_url(info['avatar'])
|
||||
info['channel_url'] = util.prefix_url(info['channel_url'])
|
||||
for item in info['items']:
|
||||
util.prefix_urls(item)
|
||||
util.add_extra_html_info(item)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
playlist_sort_codes = {'2': "da", '3': "dd", '4': "lad"}
|
||||
|
||||
# youtube.com/[channel_id]/[tab]
|
||||
# youtube.com/user/[username]/[tab]
|
||||
# youtube.com/c/[custom]/[tab]
|
||||
# youtube.com/[custom]/[tab]
|
||||
def get_channel_page_general_url(base_url, tab, request, channel_id=None):
|
||||
|
||||
page_number = int(request.args.get('page', 1))
|
||||
sort = request.args.get('sort', '3')
|
||||
view = request.args.get('view', '1')
|
||||
query = request.args.get('query', '')
|
||||
|
||||
if tab == 'videos' and channel_id:
|
||||
tasks = (
|
||||
gevent.spawn(get_number_of_videos_channel, channel_id),
|
||||
gevent.spawn(get_channel_tab, channel_id, page_number, sort, 'videos', view)
|
||||
)
|
||||
gevent.joinall(tasks)
|
||||
util.check_gevent_exceptions(*tasks)
|
||||
number_of_videos, polymer_json = tasks[0].value, tasks[1].value
|
||||
elif tab == 'videos':
|
||||
tasks = (
|
||||
gevent.spawn(get_number_of_videos_general, base_url),
|
||||
gevent.spawn(util.fetch_url, base_url + '/videos?pbj=1&view=0', headers_desktop, debug_name='gen_channel_videos')
|
||||
)
|
||||
gevent.joinall(tasks)
|
||||
util.check_gevent_exceptions(*tasks)
|
||||
number_of_videos, polymer_json = tasks[0].value, tasks[1].value
|
||||
elif tab == 'about':
|
||||
polymer_json = util.fetch_url(base_url + '/about?pbj=1', headers_desktop, debug_name='gen_channel_about')
|
||||
elif tab == 'playlists':
|
||||
polymer_json = util.fetch_url(base_url+ '/playlists?pbj=1&view=1&sort=' + playlist_sort_codes[sort], headers_desktop, debug_name='gen_channel_playlists')
|
||||
elif tab == 'search' and channel_id:
|
||||
polymer_json = get_channel_search_json(channel_id, query, page_number)
|
||||
elif tab == 'search':
|
||||
url = base_url + '/search?pbj=1&query=' + urllib.parse.quote(query, safe='')
|
||||
polymer_json = util.fetch_url(url, headers_desktop, debug_name='gen_channel_search')
|
||||
else:
|
||||
flask.abort(404, 'Unknown channel tab: ' + tab)
|
||||
|
||||
|
||||
info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab)
|
||||
if info['error'] is not None:
|
||||
return flask.render_template('error.html', error_message = info['error'])
|
||||
|
||||
post_process_channel_info(info)
|
||||
if tab == 'videos':
|
||||
info['number_of_videos'] = number_of_videos
|
||||
info['number_of_pages'] = math.ceil(number_of_videos/30)
|
||||
info['header_playlist_names'] = local_playlist.get_playlist_names()
|
||||
if tab in ('videos', 'playlists'):
|
||||
info['current_sort'] = sort
|
||||
elif tab == 'search':
|
||||
info['search_box_value'] = query
|
||||
info['header_playlist_names'] = local_playlist.get_playlist_names()
|
||||
info['page_number'] = page_number
|
||||
info['subscribed'] = subscriptions.is_subscribed(info['channel_id'])
|
||||
|
||||
return flask.render_template('channel.html',
|
||||
parameters_dictionary = request.args,
|
||||
**info
|
||||
)
|
||||
|
||||
@yt_app.route('/channel/<channel_id>/')
|
||||
@yt_app.route('/channel/<channel_id>/<tab>')
|
||||
def get_channel_page(channel_id, tab='videos'):
|
||||
return get_channel_page_general_url('https://www.youtube.com/channel/' + channel_id, tab, request, channel_id)
|
||||
|
||||
@yt_app.route('/user/<username>/')
|
||||
@yt_app.route('/user/<username>/<tab>')
|
||||
def get_user_page(username, tab='videos'):
|
||||
return get_channel_page_general_url('https://www.youtube.com/user/' + username, tab, request)
|
||||
|
||||
@yt_app.route('/c/<custom>/')
|
||||
@yt_app.route('/c/<custom>/<tab>')
|
||||
def get_custom_c_page(custom, tab='videos'):
|
||||
return get_channel_page_general_url('https://www.youtube.com/c/' + custom, tab, request)
|
||||
|
||||
@yt_app.route('/<custom>')
|
||||
@yt_app.route('/<custom>/<tab>')
|
||||
def get_toplevel_custom_page(custom, tab='videos'):
|
||||
return get_channel_page_general_url('https://www.youtube.com/' + custom, tab, request)
|
||||
|
145
youtube/comments.py
Normal file
145
youtube/comments.py
Normal file
@ -0,0 +1,145 @@
|
||||
import base64
|
||||
import json
|
||||
|
||||
from youtube import proto, util, yt_data_extract
|
||||
from youtube.util import concat_or_none
|
||||
|
||||
|
||||
# Here's what I know about the secret key (starting with ASJN_i)
|
||||
# *The secret key definitely contains the following information (or perhaps the information is stored at youtube's servers):
|
||||
# -Video id
|
||||
# -Offset
|
||||
# -Sort
|
||||
# *If the video id or sort in the ctoken contradicts the ASJN, the response is an error. The offset encoded outside the ASJN is ignored entirely.
|
||||
# *The ASJN is base64 encoded data, indicated by the fact that the character after "ASJN_i" is one of ("0", "1", "2", "3")
|
||||
# *The encoded data is not valid protobuf
|
||||
# *The encoded data (after the 5 or so bytes that are always the same) is indistinguishable from random data according to a battery of randomness tests
|
||||
# *The ASJN in the ctoken provided by a response changes in regular intervals of about a second or two.
|
||||
# *Old ASJN's continue to work, and start at the same comment even if new comments have been posted since
|
||||
# *The ASJN has no relation with any of the data in the response it came from
|
||||
|
||||
def make_comment_ctoken(video_id, sort=0, offset=0, lc='', secret_key=''):
|
||||
video_id = proto.as_bytes(video_id)
|
||||
secret_key = proto.as_bytes(secret_key)
|
||||
|
||||
|
||||
page_info = proto.string(4,video_id) + proto.uint(6, sort)
|
||||
offset_information = proto.nested(4, page_info) + proto.uint(5, offset)
|
||||
if secret_key:
|
||||
offset_information = proto.string(1, secret_key) + offset_information
|
||||
|
||||
page_params = proto.string(2, video_id)
|
||||
if lc:
|
||||
page_params += proto.string(6, proto.percent_b64encode(proto.string(15, lc)))
|
||||
|
||||
result = proto.nested(2, page_params) + proto.uint(3,6) + proto.nested(6, offset_information)
|
||||
return base64.urlsafe_b64encode(result).decode('ascii')
|
||||
|
||||
def comment_replies_ctoken(video_id, comment_id, max_results=500):
|
||||
|
||||
params = proto.string(2, comment_id) + proto.uint(9, max_results)
|
||||
params = proto.nested(3, params)
|
||||
|
||||
result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3,6) + proto.nested(6, params)
|
||||
return base64.urlsafe_b64encode(result).decode('ascii')
|
||||
|
||||
|
||||
|
||||
mobile_headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
|
||||
'Accept': '*/*',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'X-YouTube-Client-Name': '2',
|
||||
'X-YouTube-Client-Version': '2.20180823',
|
||||
}
|
||||
def request_comments(ctoken, replies=False):
|
||||
if replies: # let's make it use different urls for no reason despite all the data being encoded
|
||||
base_url = "https://m.youtube.com/watch_comment?action_get_comment_replies=1&ctoken="
|
||||
else:
|
||||
base_url = "https://m.youtube.com/watch_comment?action_get_comments=1&ctoken="
|
||||
url = base_url + ctoken.replace("=", "%3D") + "&pbj=1"
|
||||
|
||||
for i in range(0,8): # don't retry more than 8 times
|
||||
content = util.fetch_url(url, headers=mobile_headers, report_text="Retrieved comments", debug_name='request_comments')
|
||||
if content[0:4] == b")]}'": # random closing characters included at beginning of response for some reason
|
||||
content = content[4:]
|
||||
elif content[0:10] == b'\n<!DOCTYPE': # occasionally returns html instead of json for no reason
|
||||
content = b''
|
||||
print("got <!DOCTYPE>, retrying")
|
||||
continue
|
||||
break
|
||||
|
||||
polymer_json = json.loads(util.uppercase_escape(content.decode('utf-8')))
|
||||
return polymer_json
|
||||
|
||||
|
||||
def single_comment_ctoken(video_id, comment_id):
|
||||
page_params = proto.string(2, video_id) + proto.string(6, proto.percent_b64encode(proto.string(15, comment_id)))
|
||||
|
||||
result = proto.nested(2, page_params) + proto.uint(3,6)
|
||||
return base64.urlsafe_b64encode(result).decode('ascii')
|
||||
|
||||
|
||||
|
||||
def post_process_comments_info(comments_info):
|
||||
for comment in comments_info['comments']:
|
||||
comment['author_url'] = concat_or_none(
|
||||
util.URL_ORIGIN, comment['author_url'])
|
||||
comment['author_avatar'] = concat_or_none(
|
||||
'/', comment['author_avatar'])
|
||||
|
||||
comment['permalink'] = concat_or_none(util.URL_ORIGIN, '/watch?v=',
|
||||
comments_info['video_id'], '&lc=', comment['id'])
|
||||
|
||||
reply_count = comment['reply_count']
|
||||
if reply_count == 0:
|
||||
comment['replies_url'] = concat_or_none(util.URL_ORIGIN,
|
||||
'/post_comment?parent_id=', comment['id'],
|
||||
'&video_id=', comments_info['video_id'])
|
||||
else:
|
||||
comment['replies_url'] = concat_or_none(util.URL_ORIGIN,
|
||||
'/comments?parent_id=', comment['id'],
|
||||
'&video_id=', comments_info['video_id'])
|
||||
|
||||
if reply_count == 0:
|
||||
comment['view_replies_text'] = 'Reply'
|
||||
elif reply_count == 1:
|
||||
comment['view_replies_text'] = '1 reply'
|
||||
else:
|
||||
comment['view_replies_text'] = str(reply_count) + ' replies'
|
||||
|
||||
|
||||
if comment['like_count'] == 1:
|
||||
comment['likes_text'] = '1 like'
|
||||
else:
|
||||
comment['likes_text'] = str(comment['like_count']) + ' likes'
|
||||
|
||||
|
||||
if comments_info['ctoken']:
|
||||
comments_info['more_comments_url'] = concat_or_none(util.URL_ORIGIN,
|
||||
'/comments?ctoken=', comments_info['ctoken'])
|
||||
|
||||
comments_info['page_number'] = page_number = str(int(comments_info['offset']/20) + 1)
|
||||
|
||||
if not comments_info['is_replies']:
|
||||
comments_info['sort_text'] = 'top' if comments_info['sort'] == 0 else 'newest'
|
||||
|
||||
|
||||
comments_info['video_url'] = concat_or_none(util.URL_ORIGIN,
|
||||
'/watch?v=', comments_info['video_id'])
|
||||
comments_info['video_thumbnail'] = concat_or_none('/i.ytimg.com/vi/',
|
||||
comments_info['video_id'], '/mqdefault.jpg')
|
||||
|
||||
|
||||
def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
|
||||
comments_info = yt_data_extract.extract_comments_info(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key)))
|
||||
post_process_comments_info(comments_info)
|
||||
|
||||
post_comment_url = util.URL_ORIGIN + "/post_comment?video_id=" + video_id
|
||||
other_sort_url = util.URL_ORIGIN + '/comments?ctoken=' + make_comment_ctoken(video_id, sort=1 - sort, lc=lc)
|
||||
other_sort_text = 'Sort by ' + ('newest' if sort == 0 else 'top')
|
||||
comments_info['comment_links'] = [('Post comment', post_comment_url), (other_sort_text, other_sort_url)]
|
||||
|
||||
return comments_info
|
||||
|
||||
return {}
|
11
youtube/opensearch.xml
Normal file
11
youtube/opensearch.xml
Normal file
@ -0,0 +1,11 @@
|
||||
<SearchPlugin xmlns="http://www.mozilla.org/2006/browser/search/">
|
||||
<ShortName>Youtube local</ShortName>
|
||||
<Description>no CIA shit in the background</Description>
|
||||
<InputEncoding>UTF-8</InputEncoding>
|
||||
<Image width="16" height="16"></Image>
|
||||
|
||||
<Url type="text/html" method="GET" template="http://localhost:$port_number/youtube.com/search">
|
||||
<Param name="query" value="{searchTerms}"/>
|
||||
</Url>
|
||||
<SearchForm>http://localhost:$port_number/youtube.com/search</SearchForm>
|
||||
</SearchPlugin>
|
123
youtube/playlist.py
Normal file
123
youtube/playlist.py
Normal file
@ -0,0 +1,123 @@
|
||||
from youtube import util, yt_data_extract, proto, local_playlist
|
||||
from youtube import yt_app
|
||||
|
||||
import base64
|
||||
import urllib
|
||||
import json
|
||||
import string
|
||||
import gevent
|
||||
import math
|
||||
from flask import request
|
||||
import flask
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def playlist_ctoken(playlist_id, offset):
|
||||
|
||||
offset = proto.uint(1, offset)
|
||||
# this is just obfuscation as far as I can tell. It doesn't even follow protobuf
|
||||
offset = b'PT:' + proto.unpadded_b64encode(offset)
|
||||
offset = proto.string(15, offset)
|
||||
|
||||
continuation_info = proto.string( 3, proto.percent_b64encode(offset) )
|
||||
|
||||
playlist_id = proto.string(2, 'VL' + playlist_id )
|
||||
pointless_nest = proto.string(80226972, playlist_id + continuation_info)
|
||||
|
||||
return base64.urlsafe_b64encode(pointless_nest).decode('ascii')
|
||||
|
||||
# initial request types:
|
||||
# polymer_json: https://m.youtube.com/playlist?list=PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ&pbj=1&lact=0
|
||||
# ajax json: https://m.youtube.com/playlist?list=PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ&pbj=1&lact=0 with header X-YouTube-Client-Version: 1.20180418
|
||||
|
||||
|
||||
# continuation request types:
|
||||
# polymer_json: https://m.youtube.com/playlist?&ctoken=[...]&pbj=1
|
||||
# ajax json: https://m.youtube.com/playlist?action_continuation=1&ajax=1&ctoken=[...]
|
||||
|
||||
|
||||
headers_1 = (
|
||||
('Accept', '*/*'),
|
||||
('Accept-Language', 'en-US,en;q=0.5'),
|
||||
('X-YouTube-Client-Name', '2'),
|
||||
('X-YouTube-Client-Version', '2.20180614'),
|
||||
)
|
||||
|
||||
def playlist_first_page(playlist_id, report_text = "Retrieved playlist"):
|
||||
url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&pbj=1'
|
||||
content = util.fetch_url(url, util.mobile_ua + headers_1, report_text=report_text, debug_name='playlist_first_page')
|
||||
content = json.loads(util.uppercase_escape(content.decode('utf-8')))
|
||||
|
||||
return content
|
||||
|
||||
|
||||
#https://m.youtube.com/playlist?itct=CBMQybcCIhMIptj9xJaJ2wIV2JKcCh3Idwu-&ctoken=4qmFsgI2EiRWTFBMT3kwajlBdmxWWlB0bzZJa2pLZnB1MFNjeC0tN1BHVEMaDmVnWlFWRHBEUWxFJTNE&pbj=1
|
||||
def get_videos(playlist_id, page):
|
||||
|
||||
url = "https://m.youtube.com/playlist?ctoken=" + playlist_ctoken(playlist_id, (int(page)-1)*20) + "&pbj=1"
|
||||
headers = {
|
||||
'User-Agent': ' Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
|
||||
'Accept': '*/*',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'X-YouTube-Client-Name': '2',
|
||||
'X-YouTube-Client-Version': '2.20180508',
|
||||
}
|
||||
|
||||
content = util.fetch_url(url, headers, report_text="Retrieved playlist", debug_name='playlist_videos')
|
||||
|
||||
info = json.loads(util.uppercase_escape(content.decode('utf-8')))
|
||||
return info
|
||||
|
||||
|
||||
@yt_app.route('/playlist')
|
||||
def get_playlist_page():
|
||||
if 'list' not in request.args:
|
||||
abort(400)
|
||||
|
||||
playlist_id = request.args.get('list')
|
||||
page = request.args.get('page', '1')
|
||||
|
||||
if page == '1':
|
||||
first_page_json = playlist_first_page(playlist_id)
|
||||
this_page_json = first_page_json
|
||||
else:
|
||||
tasks = (
|
||||
gevent.spawn(playlist_first_page, playlist_id, report_text="Retrieved playlist info" ),
|
||||
gevent.spawn(get_videos, playlist_id, page)
|
||||
)
|
||||
gevent.joinall(tasks)
|
||||
util.check_gevent_exceptions(*tasks)
|
||||
first_page_json, this_page_json = tasks[0].value, tasks[1].value
|
||||
|
||||
info = yt_data_extract.extract_playlist_info(this_page_json)
|
||||
if info['error']:
|
||||
return flask.render_template('error.html', error_message = info['error'])
|
||||
|
||||
if page != '1':
|
||||
info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json)
|
||||
|
||||
util.prefix_urls(info['metadata'])
|
||||
for item in info.get('items', ()):
|
||||
util.prefix_urls(item)
|
||||
util.add_extra_html_info(item)
|
||||
if 'id' in item:
|
||||
item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg'
|
||||
|
||||
item['url'] += '&list=' + playlist_id
|
||||
if item['index']:
|
||||
item['url'] += '&index=' + str(item['index'])
|
||||
|
||||
video_count = yt_data_extract.deep_get(info, 'metadata', 'video_count')
|
||||
if video_count is None:
|
||||
video_count = 40
|
||||
|
||||
return flask.render_template('playlist.html',
|
||||
header_playlist_names = local_playlist.get_playlist_names(),
|
||||
video_list = info.get('items', []),
|
||||
num_pages = math.ceil(video_count/20),
|
||||
parameters_dictionary = request.args,
|
||||
|
||||
**info['metadata']
|
||||
).encode('utf-8')
|
129
youtube/proto.py
Normal file
129
youtube/proto.py
Normal file
@ -0,0 +1,129 @@
|
||||
from math import ceil
|
||||
import base64
|
||||
import io
|
||||
|
||||
def byte(n):
|
||||
return bytes((n,))
|
||||
|
||||
|
||||
def varint_encode(offset):
|
||||
'''In this encoding system, for each 8-bit byte, the first bit is 1 if there are more bytes, and 0 is this is the last one.
|
||||
The next 7 bits are data. These 7-bit sections represent the data in Little endian order. For example, suppose the data is
|
||||
aaaaaaabbbbbbbccccccc (each of these sections is 7 bits). It will be encoded as:
|
||||
1ccccccc 1bbbbbbb 0aaaaaaa
|
||||
|
||||
This encoding is used in youtube parameters to encode offsets and to encode the length for length-prefixed data.
|
||||
See https://developers.google.com/protocol-buffers/docs/encoding#varints for more info.'''
|
||||
needed_bytes = ceil(offset.bit_length()/7) or 1 # (0).bit_length() returns 0, but we need 1 in that case.
|
||||
encoded_bytes = bytearray(needed_bytes)
|
||||
for i in range(0, needed_bytes - 1):
|
||||
encoded_bytes[i] = (offset & 127) | 128 # 7 least significant bits
|
||||
offset = offset >> 7
|
||||
encoded_bytes[-1] = offset & 127 # leave first bit as zero for last byte
|
||||
|
||||
return bytes(encoded_bytes)
|
||||
|
||||
|
||||
def varint_decode(encoded):
|
||||
decoded = 0
|
||||
for i, byte in enumerate(encoded):
|
||||
decoded |= (byte & 127) << 7*i
|
||||
|
||||
if not (byte & 128):
|
||||
break
|
||||
return decoded
|
||||
|
||||
|
||||
def string(field_number, data):
|
||||
data = as_bytes(data)
|
||||
return _proto_field(2, field_number, varint_encode(len(data)) + data)
|
||||
nested = string
|
||||
|
||||
def uint(field_number, value):
|
||||
return _proto_field(0, field_number, varint_encode(value))
|
||||
|
||||
|
||||
|
||||
|
||||
def _proto_field(wire_type, field_number, data):
|
||||
''' See https://developers.google.com/protocol-buffers/docs/encoding#structure '''
|
||||
return varint_encode( (field_number << 3) | wire_type) + data
|
||||
|
||||
|
||||
|
||||
def percent_b64encode(data):
|
||||
return base64.urlsafe_b64encode(data).replace(b'=', b'%3D')
|
||||
|
||||
|
||||
def unpadded_b64encode(data):
|
||||
return base64.urlsafe_b64encode(data).replace(b'=', b'')
|
||||
|
||||
def as_bytes(value):
|
||||
if isinstance(value, str):
|
||||
return value.encode('utf-8')
|
||||
return value
|
||||
|
||||
|
||||
def read_varint(data):
|
||||
result = 0
|
||||
i = 0
|
||||
while True:
|
||||
try:
|
||||
byte = data.read(1)[0]
|
||||
except IndexError:
|
||||
if i == 0:
|
||||
raise EOFError()
|
||||
raise Exception('Unterminated varint starting at ' + str(data.tell() - i))
|
||||
result |= (byte & 127) << 7*i
|
||||
if not byte & 128:
|
||||
break
|
||||
|
||||
i += 1
|
||||
return result
|
||||
|
||||
|
||||
def read_group(data, end_sequence):
|
||||
start = data.tell()
|
||||
index = data.original.find(end_sequence, start)
|
||||
if index == -1:
|
||||
raise Exception('Unterminated group')
|
||||
data.seek(index + len(end_sequence))
|
||||
return data.original[start:index]
|
||||
|
||||
def read_protobuf(data):
|
||||
data_original = data
|
||||
data = io.BytesIO(data)
|
||||
data.original = data_original
|
||||
while True:
|
||||
try:
|
||||
tag = read_varint(data)
|
||||
except EOFError:
|
||||
break
|
||||
wire_type = tag & 7
|
||||
field_number = tag >> 3
|
||||
|
||||
if wire_type == 0:
|
||||
value = read_varint(data)
|
||||
elif wire_type == 1:
|
||||
value = data.read(8)
|
||||
elif wire_type == 2:
|
||||
length = read_varint(data)
|
||||
value = data.read(length)
|
||||
elif wire_type == 3:
|
||||
end_bytes = encode_varint((field_number << 3) | 4)
|
||||
value = read_group(data, end_bytes)
|
||||
elif wire_type == 5:
|
||||
value = data.read(4)
|
||||
else:
|
||||
raise Exception("Unknown wire type: " + str(wire_type) + ", Tag: " + bytes_to_hex(succinct_encode(tag)) + ", at position " + str(data.tell()))
|
||||
yield (wire_type, field_number, value)
|
||||
|
||||
def parse(data):
|
||||
return {field_number: value for _, field_number, value in read_protobuf(data)}
|
||||
|
||||
def b64_to_bytes(data):
|
||||
if isinstance(data, bytes):
|
||||
data = data.decode('ascii')
|
||||
data = data.replace("%3D", "=")
|
||||
return base64.urlsafe_b64decode(data + "="*((4 - len(data)%4)%4) )
|
||||
|
105
youtube/search.py
Normal file
105
youtube/search.py
Normal file
@ -0,0 +1,105 @@
|
||||
import base64
|
||||
import json
|
||||
import urllib
|
||||
|
||||
import flask
|
||||
from flask import request
|
||||
from werkzeug.exceptions import abort
|
||||
|
||||
from youtube import util, yt_data_extract, proto
|
||||
from youtube import yt_app
|
||||
|
||||
# Sort: 1
|
||||
# Upload date: 2
|
||||
# View count: 3
|
||||
# Rating: 1
|
||||
# Relevance: 0
|
||||
# Offset: 9
|
||||
# Filters: 2
|
||||
# Upload date: 1
|
||||
# Type: 2
|
||||
# Duration: 3
|
||||
|
||||
|
||||
features = {
|
||||
'4k': 14,
|
||||
'hd': 4,
|
||||
'hdr': 25,
|
||||
'subtitles': 5,
|
||||
'creative_commons': 6,
|
||||
'3d': 7,
|
||||
'live': 8,
|
||||
'purchased': 9,
|
||||
'360': 15,
|
||||
'location': 23,
|
||||
}
|
||||
|
||||
def page_number_to_sp_parameter(page, autocorrect, sort, filters):
|
||||
offset = (int(page) - 1)*20 # 20 results per page
|
||||
autocorrect = proto.nested(8, proto.uint(1, 1 - int(autocorrect) ))
|
||||
filters_enc = proto.nested(2, proto.uint(1, filters['time']) + proto.uint(2, filters['type']) + proto.uint(3, filters['duration']))
|
||||
result = proto.uint(1, sort) + filters_enc + autocorrect + proto.uint(9, offset) + proto.string(61, b'')
|
||||
return base64.urlsafe_b64encode(result).decode('ascii')
|
||||
|
||||
def get_search_json(query, page, autocorrect, sort, filters):
|
||||
url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(query)
|
||||
headers = {
|
||||
'Host': 'www.youtube.com',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
|
||||
'Accept': '*/*',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'X-YouTube-Client-Name': '1',
|
||||
'X-YouTube-Client-Version': '2.20180418',
|
||||
}
|
||||
url += "&pbj=1&sp=" + page_number_to_sp_parameter(page, autocorrect, sort, filters).replace("=", "%3D")
|
||||
content = util.fetch_url(url, headers=headers, report_text="Got search results", debug_name='search_results')
|
||||
info = json.loads(content)
|
||||
return info
|
||||
|
||||
|
||||
@yt_app.route('/search')
|
||||
def get_search_page():
|
||||
if len(request.args) == 0:
|
||||
return flask.render_template('base.html', title="Search")
|
||||
|
||||
if 'query' not in request.args:
|
||||
abort(400)
|
||||
|
||||
query = request.args.get("query")
|
||||
page = request.args.get("page", "1")
|
||||
autocorrect = int(request.args.get("autocorrect", "1"))
|
||||
sort = int(request.args.get("sort", "0"))
|
||||
filters = {}
|
||||
filters['time'] = int(request.args.get("time", "0"))
|
||||
filters['type'] = int(request.args.get("type", "0"))
|
||||
filters['duration'] = int(request.args.get("duration", "0"))
|
||||
polymer_json = get_search_json(query, page, autocorrect, sort, filters)
|
||||
|
||||
search_info = yt_data_extract.extract_search_info(polymer_json)
|
||||
if search_info['error']:
|
||||
return flask.render_template('error.html', error_message = search_info['error'])
|
||||
|
||||
for extract_item_info in search_info['items']:
|
||||
util.prefix_urls(extract_item_info)
|
||||
util.add_extra_html_info(extract_item_info)
|
||||
|
||||
corrections = search_info['corrections']
|
||||
if corrections['type'] == 'did_you_mean':
|
||||
corrected_query_string = request.args.to_dict(flat=False)
|
||||
corrected_query_string['query'] = [corrections['corrected_query']]
|
||||
corrections['corrected_query_url'] = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True)
|
||||
elif corrections['type'] == 'showing_results_for':
|
||||
no_autocorrect_query_string = request.args.to_dict(flat=False)
|
||||
no_autocorrect_query_string['autocorrect'] = ['0']
|
||||
no_autocorrect_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(no_autocorrect_query_string, doseq=True)
|
||||
corrections['original_query_url'] = no_autocorrect_query_url
|
||||
|
||||
return flask.render_template('search.html',
|
||||
header_playlist_names = local_playlist.get_playlist_names(),
|
||||
query = query,
|
||||
estimated_results = search_info['estimated_results'],
|
||||
estimated_pages = search_info['estimated_pages'],
|
||||
corrections = search_info['corrections'],
|
||||
results = search_info['items'],
|
||||
parameters_dictionary = request.args,
|
||||
)
|
397
youtube/util.py
Normal file
397
youtube/util.py
Normal file
@ -0,0 +1,397 @@
|
||||
import gzip
|
||||
|
||||
from youtube import yt_data_extract
|
||||
|
||||
try:
|
||||
import brotli
|
||||
have_brotli = True
|
||||
except ImportError:
|
||||
have_brotli = False
|
||||
import urllib.parse
|
||||
import re
|
||||
import time
|
||||
import os
|
||||
import json
|
||||
import gevent
|
||||
import gevent.queue
|
||||
import gevent.lock
|
||||
|
||||
# The trouble with the requests library: It ships its own certificate bundle via certifi
|
||||
# instead of using the system certificate store, meaning self-signed certificates
|
||||
# configured by the user will not work. Some draconian networks block TLS unless a corporate
|
||||
# certificate is installed on the system. Additionally, some users install a self signed cert
|
||||
# in order to use programs to modify or monitor requests made by programs on the system.
|
||||
|
||||
# Finally, certificates expire and need to be updated, or are sometimes revoked. Sometimes
|
||||
# certificate authorites go rogue and need to be untrusted. Since we are going through Tor exit nodes,
|
||||
# this becomes all the more important. A rogue CA could issue a fake certificate for accounts.google.com, and a
|
||||
# malicious exit node could use this to decrypt traffic when logging in and retrieve passwords. Examples:
|
||||
# https://www.engadget.com/2015/10/29/google-warns-symantec-over-certificates/
|
||||
# https://nakedsecurity.sophos.com/2013/12/09/serious-security-google-finds-fake-but-trusted-ssl-certificates-for-its-domains-made-in-france/
|
||||
|
||||
# In the requests documentation it says:
|
||||
# "Before version 2.16, Requests bundled a set of root CAs that it trusted, sourced from the Mozilla trust store.
|
||||
# The certificates were only updated once for each Requests version. When certifi was not installed,
|
||||
# this led to extremely out-of-date certificate bundles when using significantly older versions of Requests.
|
||||
# For the sake of security we recommend upgrading certifi frequently!"
|
||||
# (http://docs.python-requests.org/en/master/user/advanced/#ca-certificates)
|
||||
|
||||
# Expecting users to remember to manually update certifi on Linux isn't reasonable in my view.
|
||||
# On windows, this is even worse since I am distributing all dependencies. This program is not
|
||||
# updated frequently, and using requests would lead to outdated certificates. Certificates
|
||||
# should be updated with OS updates, instead of thousands of developers of different programs
|
||||
# being expected to do this correctly 100% of the time.
|
||||
|
||||
# There is hope that this might be fixed eventually:
|
||||
# https://github.com/kennethreitz/requests/issues/2966
|
||||
|
||||
# Until then, I will use a mix of urllib3 and urllib.
|
||||
import urllib3
|
||||
import urllib3.contrib.socks
|
||||
|
||||
URL_ORIGIN = "/https://www.youtube.com"
|
||||
|
||||
connection_pool = urllib3.PoolManager(cert_reqs = 'CERT_REQUIRED')
|
||||
|
||||
def get_pool(use_tor):
|
||||
return connection_pool
|
||||
|
||||
class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
|
||||
'''Separate cookiejars for receiving and sending'''
|
||||
def __init__(self, cookiejar_send=None, cookiejar_receive=None):
|
||||
self.cookiejar_send = cookiejar_send
|
||||
self.cookiejar_receive = cookiejar_receive
|
||||
|
||||
def http_request(self, request):
|
||||
if self.cookiejar_send is not None:
|
||||
self.cookiejar_send.add_cookie_header(request)
|
||||
return request
|
||||
|
||||
def http_response(self, request, response):
|
||||
if self.cookiejar_receive is not None:
|
||||
self.cookiejar_receive.extract_cookies(response, request)
|
||||
return response
|
||||
|
||||
https_request = http_request
|
||||
https_response = http_response
|
||||
|
||||
class FetchError(Exception):
|
||||
def __init__(self, code, reason='', ip=None):
|
||||
Exception.__init__(self, 'HTTP error during request: ' + code + ' ' + reason)
|
||||
self.code = code
|
||||
self.reason = reason
|
||||
self.ip = ip
|
||||
|
||||
def decode_content(content, encoding_header):
|
||||
encodings = encoding_header.replace(' ', '').split(',')
|
||||
for encoding in reversed(encodings):
|
||||
if encoding == 'identity':
|
||||
continue
|
||||
if encoding == 'br':
|
||||
content = brotli.decompress(content)
|
||||
elif encoding == 'gzip':
|
||||
content = gzip.decompress(content)
|
||||
return content
|
||||
|
||||
def fetch_url_response(url, headers=(), timeout=15, data=None,
|
||||
cookiejar_send=None, cookiejar_receive=None,
|
||||
use_tor=True, max_redirects=None):
|
||||
'''
|
||||
returns response, cleanup_function
|
||||
When cookiejar_send is set to a CookieJar object,
|
||||
those cookies will be sent in the request (but cookies in response will not be merged into it)
|
||||
When cookiejar_receive is set to a CookieJar object,
|
||||
cookies received in the response will be merged into the object (nothing will be sent from it)
|
||||
When both are set to the same object, cookies will be sent from the object,
|
||||
and response cookies will be merged into it.
|
||||
'''
|
||||
headers = dict(headers) # Note: Calling dict() on a dict will make a copy
|
||||
if have_brotli:
|
||||
headers['Accept-Encoding'] = 'gzip, br'
|
||||
else:
|
||||
headers['Accept-Encoding'] = 'gzip'
|
||||
|
||||
# prevent python version being leaked by urllib if User-Agent isn't provided
|
||||
# (urllib will use ex. Python-urllib/3.6 otherwise)
|
||||
if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers:
|
||||
headers['User-Agent'] = 'Python-urllib'
|
||||
|
||||
method = "GET"
|
||||
if data is not None:
|
||||
method = "POST"
|
||||
if isinstance(data, str):
|
||||
data = data.encode('ascii')
|
||||
elif not isinstance(data, bytes):
|
||||
data = urllib.parse.urlencode(data).encode('ascii')
|
||||
|
||||
if cookiejar_send is not None or cookiejar_receive is not None: # Use urllib
|
||||
req = urllib.request.Request(url, data=data, headers=headers)
|
||||
|
||||
cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
|
||||
opener = urllib.request.build_opener(cookie_processor)
|
||||
|
||||
response = opener.open(req, timeout=timeout)
|
||||
cleanup_func = (lambda r: None)
|
||||
|
||||
else: # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
|
||||
# default: Retry.DEFAULT = Retry(3)
|
||||
# (in connectionpool.py in urllib3)
|
||||
# According to the documentation for urlopen, a redirect counts as a
|
||||
# retry. So there are 3 redirects max by default.
|
||||
if max_redirects:
|
||||
retries = urllib3.Retry(3+max_redirects, redirect=max_redirects)
|
||||
else:
|
||||
retries = urllib3.Retry(3)
|
||||
pool = get_pool(use_tor)
|
||||
response = pool.request(method, url, headers=headers,
|
||||
timeout=timeout, preload_content=False,
|
||||
decode_content=False, retries=retries)
|
||||
cleanup_func = (lambda r: r.release_conn())
|
||||
|
||||
return response, cleanup_func
|
||||
|
||||
def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
|
||||
cookiejar_send=None, cookiejar_receive=None, use_tor=True,
|
||||
debug_name=None):
|
||||
start_time = time.time()
|
||||
|
||||
response, cleanup_func = fetch_url_response(
|
||||
url, headers, timeout=timeout,
|
||||
cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive,
|
||||
use_tor=use_tor)
|
||||
response_time = time.time()
|
||||
|
||||
content = response.read()
|
||||
read_finish = time.time()
|
||||
|
||||
cleanup_func(response) # release_connection for urllib3
|
||||
|
||||
if (response.status == 429
|
||||
and content.startswith(b'<!DOCTYPE')
|
||||
and b'Our systems have detected unusual traffic' in content):
|
||||
ip = re.search(br'IP address: ((?:[\da-f]*:)+[\da-f]+|(?:\d+\.)+\d+)',
|
||||
content)
|
||||
ip = ip.group(1).decode('ascii') if ip else None
|
||||
raise FetchError('429', reason=response.reason, ip=ip)
|
||||
|
||||
elif response.status >= 400:
|
||||
raise FetchError(str(response.status), reason=response.reason, ip=None)
|
||||
|
||||
if report_text:
|
||||
print(report_text, ' Latency:', round(response_time - start_time,3), ' Read time:', round(read_finish - response_time,3))
|
||||
content = decode_content(content, response.getheader('Content-Encoding', default='identity'))
|
||||
return content
|
||||
|
||||
def head(url, use_tor=False, report_text=None, max_redirects=10):
|
||||
pool = get_pool(use_tor)
|
||||
start_time = time.time()
|
||||
|
||||
# default: Retry.DEFAULT = Retry(3)
|
||||
# (in connectionpool.py in urllib3)
|
||||
# According to the documentation for urlopen, a redirect counts as a retry
|
||||
# So there are 3 redirects max by default. Let's change that
|
||||
# to 10 since googlevideo redirects a lot.
|
||||
retries = urllib3.Retry(3+max_redirects, redirect=max_redirects,
|
||||
raise_on_redirect=False)
|
||||
headers = {'User-Agent': 'Python-urllib'}
|
||||
response = pool.request('HEAD', url, headers=headers, retries=retries)
|
||||
if report_text:
|
||||
print(report_text, ' Latency:', round(time.time() - start_time,3))
|
||||
return response
|
||||
|
||||
mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
|
||||
mobile_ua = (('User-Agent', mobile_user_agent),)
|
||||
desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
|
||||
desktop_ua = (('User-Agent', desktop_user_agent),)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class RateLimitedQueue(gevent.queue.Queue):
|
||||
''' Does initial_burst (def. 30) at first, then alternates between waiting waiting_period (def. 5) seconds and doing subsequent_bursts (def. 10) queries. After 5 seconds with nothing left in the queue, resets rate limiting. '''
|
||||
|
||||
def __init__(self, initial_burst=30, waiting_period=5, subsequent_bursts=10):
|
||||
self.initial_burst = initial_burst
|
||||
self.waiting_period = waiting_period
|
||||
self.subsequent_bursts = subsequent_bursts
|
||||
|
||||
self.count_since_last_wait = 0
|
||||
self.surpassed_initial = False
|
||||
|
||||
self.lock = gevent.lock.BoundedSemaphore(1)
|
||||
self.currently_empty = False
|
||||
self.empty_start = 0
|
||||
gevent.queue.Queue.__init__(self)
|
||||
|
||||
|
||||
def get(self):
|
||||
self.lock.acquire() # blocks if another greenlet currently has the lock
|
||||
if self.count_since_last_wait >= self.subsequent_bursts and self.surpassed_initial:
|
||||
gevent.sleep(self.waiting_period)
|
||||
self.count_since_last_wait = 0
|
||||
|
||||
elif self.count_since_last_wait >= self.initial_burst and not self.surpassed_initial:
|
||||
self.surpassed_initial = True
|
||||
gevent.sleep(self.waiting_period)
|
||||
self.count_since_last_wait = 0
|
||||
|
||||
self.count_since_last_wait += 1
|
||||
|
||||
if not self.currently_empty and self.empty():
|
||||
self.currently_empty = True
|
||||
self.empty_start = time.monotonic()
|
||||
|
||||
item = gevent.queue.Queue.get(self) # blocks when nothing left
|
||||
|
||||
if self.currently_empty:
|
||||
if time.monotonic() - self.empty_start >= self.waiting_period:
|
||||
self.count_since_last_wait = 0
|
||||
self.surpassed_initial = False
|
||||
|
||||
self.currently_empty = False
|
||||
|
||||
self.lock.release()
|
||||
|
||||
return item
|
||||
|
||||
|
||||
|
||||
def download_thumbnail(save_directory, video_id):
|
||||
url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
|
||||
save_location = os.path.join(save_directory, video_id + ".jpg")
|
||||
try:
|
||||
thumbnail = fetch_url(url, report_text="Saved thumbnail: " + video_id)
|
||||
except urllib.error.HTTPError as e:
|
||||
print("Failed to download thumbnail for " + video_id + ": " + str(e))
|
||||
return False
|
||||
try:
|
||||
f = open(save_location, 'wb')
|
||||
except FileNotFoundError:
|
||||
os.makedirs(save_directory, exist_ok = True)
|
||||
f = open(save_location, 'wb')
|
||||
f.write(thumbnail)
|
||||
f.close()
|
||||
return True
|
||||
|
||||
def download_thumbnails(save_directory, ids):
|
||||
if not isinstance(ids, (list, tuple)):
|
||||
ids = list(ids)
|
||||
# only do 5 at a time
|
||||
# do the n where n is divisible by 5
|
||||
i = -1
|
||||
for i in range(0, int(len(ids)/5) - 1 ):
|
||||
gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5, i*5 + 5)])
|
||||
# do the remainders (< 5)
|
||||
gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5 + 5, len(ids))])
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def dict_add(*dicts):
|
||||
for dictionary in dicts[1:]:
|
||||
dicts[0].update(dictionary)
|
||||
return dicts[0]
|
||||
|
||||
def video_id(url):
|
||||
url_parts = urllib.parse.urlparse(url)
|
||||
return urllib.parse.parse_qs(url_parts.query)['v'][0]
|
||||
|
||||
|
||||
# default, sddefault, mqdefault, hqdefault, hq720
|
||||
def get_thumbnail_url(video_id):
|
||||
return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
|
||||
|
||||
def seconds_to_timestamp(seconds):
|
||||
seconds = int(seconds)
|
||||
hours, seconds = divmod(seconds,3600)
|
||||
minutes, seconds = divmod(seconds,60)
|
||||
if hours != 0:
|
||||
timestamp = str(hours) + ":"
|
||||
timestamp += str(minutes).zfill(2) # zfill pads with zeros
|
||||
else:
|
||||
timestamp = str(minutes)
|
||||
|
||||
timestamp += ":" + str(seconds).zfill(2)
|
||||
return timestamp
|
||||
|
||||
|
||||
|
||||
def update_query_string(query_string, items):
|
||||
parameters = urllib.parse.parse_qs(query_string)
|
||||
parameters.update(items)
|
||||
return urllib.parse.urlencode(parameters, doseq=True)
|
||||
|
||||
|
||||
|
||||
def uppercase_escape(s):
|
||||
return re.sub(
|
||||
r'\\U([0-9a-fA-F]{8})',
|
||||
lambda m: chr(int(m.group(1), base=16)), s)
|
||||
|
||||
def prefix_url(url):
|
||||
if url is None:
|
||||
return None
|
||||
url = url.lstrip('/') # some urls have // before them, which has a special meaning
|
||||
return '/' + url
|
||||
|
||||
def left_remove(string, substring):
|
||||
'''removes substring from the start of string, if present'''
|
||||
if string.startswith(substring):
|
||||
return string[len(substring):]
|
||||
return string
|
||||
|
||||
def concat_or_none(*strings):
|
||||
'''Concatenates strings. Returns None if any of the arguments are None'''
|
||||
result = ''
|
||||
for string in strings:
|
||||
if string is None:
|
||||
return None
|
||||
result += string
|
||||
return result
|
||||
|
||||
|
||||
def prefix_urls(item):
|
||||
try:
|
||||
item['thumbnail'] = prefix_url(item['thumbnail'])
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
try:
|
||||
item['author_url'] = prefix_url(item['author_url'])
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
def add_extra_html_info(item):
|
||||
if item['type'] == 'video':
|
||||
item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None
|
||||
|
||||
video_info = {}
|
||||
for key in ('id', 'title', 'author', 'duration'):
|
||||
try:
|
||||
video_info[key] = item[key]
|
||||
except KeyError:
|
||||
video_info[key] = ''
|
||||
|
||||
item['video_info'] = json.dumps(video_info)
|
||||
|
||||
elif item['type'] == 'playlist':
|
||||
item['url'] = (URL_ORIGIN + '/playlist?list=' + item['id']) if item.get('id') else None
|
||||
elif item['type'] == 'channel':
|
||||
item['url'] = (URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None
|
||||
|
||||
def parse_info_prepare_for_html(renderer, additional_info={}):
|
||||
item = yt_data_extract.extract_item_info(renderer, additional_info)
|
||||
prefix_urls(item)
|
||||
add_extra_html_info(item)
|
||||
|
||||
return item
|
||||
|
||||
def check_gevent_exceptions(*tasks):
|
||||
for task in tasks:
|
||||
if task.exception:
|
||||
raise task.exception
|
||||
|
61
youtube/utils.py
Normal file
61
youtube/utils.py
Normal file
@ -0,0 +1,61 @@
|
||||
import urllib
|
||||
from flask import Markup
|
||||
import bleach
|
||||
def get_description_snippet_text(ds):
|
||||
string = ""
|
||||
for t in ds:
|
||||
try:
|
||||
if t['bold']:
|
||||
text = "<b>"+t['text']+"</b>"
|
||||
else:
|
||||
text = t['text']
|
||||
except:
|
||||
text = t['text']
|
||||
string = string + text
|
||||
return string
|
||||
|
||||
|
||||
def concat_texts(strings):
|
||||
'''Concatenates strings. Returns None if any of the arguments are None'''
|
||||
result = ''
|
||||
for string in strings:
|
||||
if string['text'] is None:
|
||||
return None
|
||||
result += string['text']
|
||||
return result
|
||||
|
||||
|
||||
def parse_comment(raw_comment):
|
||||
cmnt = {}
|
||||
imgHostName = urllib.parse.urlparse(raw_comment['author_avatar'][1:]).netloc
|
||||
cmnt['author'] = raw_comment['author']
|
||||
cmnt['thumbnail'] = raw_comment['author_avatar'].replace("https://{}".format(imgHostName),"")[1:] + "?host=" + imgHostName
|
||||
|
||||
print(cmnt['thumbnail'])
|
||||
cmnt['channel'] = raw_comment['author_url']
|
||||
cmnt['text'] = Markup(bleach.linkify(concat_texts(raw_comment['text']).replace("\n", "<br>")))
|
||||
cmnt['date'] = raw_comment['time_published']
|
||||
|
||||
try:
|
||||
cmnt['creatorHeart'] = raw_comment['creatorHeart']['creatorHeartRenderer']['creatorThumbnail']['thumbnails'][0][
|
||||
'url']
|
||||
except:
|
||||
cmnt['creatorHeart'] = False
|
||||
|
||||
try:
|
||||
cmnt['likes'] = raw_comment['like_count']
|
||||
except:
|
||||
cmnt['likes'] = 0
|
||||
|
||||
try:
|
||||
cmnt['replies'] = raw_comment['reply_count']
|
||||
except:
|
||||
cmnt['replies'] = 0
|
||||
return cmnt
|
||||
|
||||
|
||||
def post_process_comments_info(comments_info):
|
||||
comments = []
|
||||
for comment in comments_info['comments']:
|
||||
comments.append(parse_comment(comment))
|
||||
return comments
|
246
youtube/watch.py
Normal file
246
youtube/watch.py
Normal file
@ -0,0 +1,246 @@
|
||||
import json
|
||||
import math
|
||||
import traceback
|
||||
import urllib
|
||||
|
||||
from youtube import util, yt_data_extract
|
||||
|
||||
|
||||
def get_video_sources(info, tor_bypass=False):
|
||||
video_sources = []
|
||||
max_resolution = "720"
|
||||
for fmt in info['formats']:
|
||||
if not all(fmt[attr] for attr in ('quality', 'width', 'ext', 'url')):
|
||||
continue
|
||||
if fmt['acodec'] and fmt['vcodec'] and fmt['height'] <= max_resolution:
|
||||
video_sources.append({
|
||||
'src': fmt['url'],
|
||||
'type': 'video/' + fmt['ext'],
|
||||
'quality': fmt['quality'],
|
||||
'height': fmt['height'],
|
||||
'width': fmt['width'],
|
||||
})
|
||||
|
||||
#### order the videos sources so the preferred resolution is first ###
|
||||
|
||||
video_sources.sort(key=lambda source: source['quality'], reverse=True)
|
||||
|
||||
return video_sources
|
||||
|
||||
def make_caption_src(info, lang, auto=False, trans_lang=None):
|
||||
label = lang
|
||||
if auto:
|
||||
label += ' (Automatic)'
|
||||
if trans_lang:
|
||||
label += ' -> ' + trans_lang
|
||||
return {
|
||||
'url': '/' + yt_data_extract.get_caption_url(info, lang, 'vtt', auto, trans_lang),
|
||||
'label': label,
|
||||
'srclang': trans_lang[0:2] if trans_lang else lang[0:2],
|
||||
'on': False,
|
||||
}
|
||||
|
||||
def lang_in(lang, sequence):
|
||||
'''Tests if the language is in sequence, with e.g. en and en-US considered the same'''
|
||||
if lang is None:
|
||||
return False
|
||||
lang = lang[0:2]
|
||||
return lang in (l[0:2] for l in sequence)
|
||||
|
||||
def lang_eq(lang1, lang2):
|
||||
'''Tests if two iso 639-1 codes are equal, with en and en-US considered the same.
|
||||
Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model'''
|
||||
if lang1 is None or lang2 is None:
|
||||
return False
|
||||
return lang1[0:2] == lang2[0:2]
|
||||
|
||||
def equiv_lang_in(lang, sequence):
|
||||
'''Extracts a language in sequence which is equivalent to lang.
|
||||
e.g. if lang is en, extracts en-GB from sequence.
|
||||
Necessary because if only a specific variant like en-GB is available, can't ask Youtube for simply en. Need to get the available variant.'''
|
||||
lang = lang[0:2]
|
||||
for l in sequence:
|
||||
if l[0:2] == lang:
|
||||
return l
|
||||
return None
|
||||
|
||||
def get_subtitle_sources(info):
|
||||
'''Returns these sources, ordered from least to most intelligible:
|
||||
native_video_lang (Automatic)
|
||||
foreign_langs (Manual)
|
||||
native_video_lang (Automatic) -> pref_lang
|
||||
foreign_langs (Manual) -> pref_lang
|
||||
native_video_lang (Manual) -> pref_lang
|
||||
pref_lang (Automatic)
|
||||
pref_lang (Manual)'''
|
||||
sources = []
|
||||
pref_lang = 'en'
|
||||
native_video_lang = None
|
||||
if info['automatic_caption_languages']:
|
||||
native_video_lang = info['automatic_caption_languages'][0]
|
||||
|
||||
highest_fidelity_is_manual = False
|
||||
|
||||
# Sources are added in very specific order outlined above
|
||||
# More intelligible sources are put further down to avoid browser bug when there are too many languages
|
||||
# (in firefox, it is impossible to select a language near the top of the list because it is cut off)
|
||||
|
||||
# native_video_lang (Automatic)
|
||||
if native_video_lang and not lang_eq(native_video_lang, pref_lang):
|
||||
sources.append(make_caption_src(info, native_video_lang, auto=True))
|
||||
|
||||
# foreign_langs (Manual)
|
||||
for lang in info['manual_caption_languages']:
|
||||
if not lang_eq(lang, pref_lang):
|
||||
sources.append(make_caption_src(info, lang))
|
||||
|
||||
if (lang_in(pref_lang, info['translation_languages'])
|
||||
and not lang_in(pref_lang, info['automatic_caption_languages'])
|
||||
and not lang_in(pref_lang, info['manual_caption_languages'])):
|
||||
# native_video_lang (Automatic) -> pref_lang
|
||||
if native_video_lang and not lang_eq(pref_lang, native_video_lang):
|
||||
sources.append(make_caption_src(info, native_video_lang, auto=True, trans_lang=pref_lang))
|
||||
|
||||
# foreign_langs (Manual) -> pref_lang
|
||||
for lang in info['manual_caption_languages']:
|
||||
if not lang_eq(lang, native_video_lang) and not lang_eq(lang, pref_lang):
|
||||
sources.append(make_caption_src(info, lang, trans_lang=pref_lang))
|
||||
|
||||
# native_video_lang (Manual) -> pref_lang
|
||||
if lang_in(native_video_lang, info['manual_caption_languages']):
|
||||
sources.append(make_caption_src(info, native_video_lang, trans_lang=pref_lang))
|
||||
|
||||
# pref_lang (Automatic)
|
||||
if lang_in(pref_lang, info['automatic_caption_languages']):
|
||||
sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['automatic_caption_languages']), auto=True))
|
||||
|
||||
# pref_lang (Manual)
|
||||
if lang_in(pref_lang, info['manual_caption_languages']):
|
||||
sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['manual_caption_languages'])))
|
||||
highest_fidelity_is_manual = True
|
||||
if len(sources) == 0:
|
||||
assert len(info['automatic_caption_languages']) == 0 and len(info['manual_caption_languages']) == 0
|
||||
|
||||
return sources
|
||||
|
||||
|
||||
def get_ordered_music_list_attributes(music_list):
|
||||
# get the set of attributes which are used by atleast 1 track
|
||||
# so there isn't an empty, extraneous album column which no tracks use, for example
|
||||
used_attributes = set()
|
||||
for track in music_list:
|
||||
used_attributes = used_attributes | track.keys()
|
||||
|
||||
# now put them in the right order
|
||||
ordered_attributes = []
|
||||
for attribute in ('Artist', 'Title', 'Album'):
|
||||
if attribute.lower() in used_attributes:
|
||||
ordered_attributes.append(attribute)
|
||||
|
||||
return ordered_attributes
|
||||
|
||||
headers = (
|
||||
('Accept', '*/*'),
|
||||
('Accept-Language', 'en-US,en;q=0.5'),
|
||||
('X-YouTube-Client-Name', '2'),
|
||||
('X-YouTube-Client-Version', '2.20180830'),
|
||||
) + util.mobile_ua
|
||||
def extract_info(video_id, use_invidious, playlist_id=None, index=None):
|
||||
# bpctr=9999999999 will bypass are-you-sure dialogs for controversial
|
||||
# videos
|
||||
url = 'https://m.youtube.com/watch?v=' + video_id + '&pbj=1&bpctr=9999999999'
|
||||
if playlist_id:
|
||||
url += '&list=' + playlist_id
|
||||
if index:
|
||||
url += '&index=' + index
|
||||
polymer_json = util.fetch_url(url, headers=headers, debug_name='watch')
|
||||
polymer_json = polymer_json.decode('utf-8')
|
||||
# TODO: Decide whether this should be done in yt_data_extract.extract_watch_info
|
||||
try:
|
||||
polymer_json = json.loads(polymer_json)
|
||||
except json.decoder.JSONDecodeError:
|
||||
traceback.print_exc()
|
||||
return {'error': 'Failed to parse json response'}
|
||||
info = yt_data_extract.extract_watch_info(polymer_json)
|
||||
|
||||
# age restriction bypass
|
||||
if info['age_restricted']:
|
||||
print('Fetching age restriction bypass page')
|
||||
data = {
|
||||
'video_id': video_id,
|
||||
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
|
||||
}
|
||||
url = 'https://www.youtube.com/get_video_info?' + urllib.parse.urlencode(data)
|
||||
video_info_page = util.fetch_url(url, debug_name='get_video_info', report_text='Fetched age restriction bypass page').decode('utf-8')
|
||||
yt_data_extract.update_with_age_restricted_info(info, video_info_page)
|
||||
# check if urls ready (non-live format) in former livestream
|
||||
# urls not ready if all of them have no filesize
|
||||
if info['was_live']:
|
||||
info['urls_ready'] = False
|
||||
for fmt in info['formats']:
|
||||
if fmt['file_size'] is not None:
|
||||
info['urls_ready'] = True
|
||||
else:
|
||||
info['urls_ready'] = True
|
||||
|
||||
# livestream urls
|
||||
# sometimes only the livestream urls work soon after the livestream is over
|
||||
if (info['hls_manifest_url']
|
||||
and (info['live'] or not info['formats'] or not info['urls_ready'])
|
||||
):
|
||||
manifest = util.fetch_url(info['hls_manifest_url'],
|
||||
debug_name='hls_manifest.m3u8',
|
||||
report_text='Fetched hls manifest'
|
||||
).decode('utf-8')
|
||||
|
||||
info['hls_formats'], err = yt_data_extract.extract_hls_formats(manifest)
|
||||
if not err:
|
||||
info['playability_error'] = None
|
||||
for fmt in info['hls_formats']:
|
||||
fmt['video_quality'] = video_quality_string(fmt)
|
||||
else:
|
||||
info['hls_formats'] = []
|
||||
|
||||
# check for 403. Unnecessary for tor video routing b/c ip address is same
|
||||
info['invidious_used'] = False
|
||||
info['invidious_reload_button'] = False
|
||||
info['tor_bypass_used'] = False
|
||||
return info
|
||||
|
||||
def video_quality_string(format):
|
||||
if format['vcodec']:
|
||||
result =str(format['width'] or '?') + 'x' + str(format['height'] or '?')
|
||||
if format['fps']:
|
||||
result += ' ' + str(format['fps']) + 'fps'
|
||||
return result
|
||||
elif format['acodec']:
|
||||
return 'audio only'
|
||||
|
||||
return '?'
|
||||
|
||||
def audio_quality_string(format):
|
||||
if format['acodec']:
|
||||
result = str(format['audio_bitrate'] or '?') + 'k'
|
||||
if format['audio_sample_rate']:
|
||||
result += ' ' + str(format['audio_sample_rate']) + ' Hz'
|
||||
return result
|
||||
elif format['vcodec']:
|
||||
return 'video only'
|
||||
|
||||
return '?'
|
||||
|
||||
# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py
|
||||
def format_bytes(bytes):
|
||||
if bytes is None:
|
||||
return 'N/A'
|
||||
if type(bytes) is str:
|
||||
bytes = float(bytes)
|
||||
if bytes == 0.0:
|
||||
exponent = 0
|
||||
else:
|
||||
exponent = int(math.log(bytes, 1024.0))
|
||||
suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
|
||||
converted = float(bytes) / float(1024 ** exponent)
|
||||
return '%.2f%s' % (converted, suffix)
|
||||
|
||||
|
12
youtube/yt_data_extract/__init__.py
Normal file
12
youtube/yt_data_extract/__init__.py
Normal file
@ -0,0 +1,12 @@
|
||||
from .common import (get, multi_get, deep_get, multi_deep_get,
|
||||
liberal_update, conservative_update, remove_redirect, normalize_url,
|
||||
extract_str, extract_formatted_text, extract_int, extract_approx_int,
|
||||
extract_date, extract_item_info, extract_items, extract_response)
|
||||
|
||||
from .everything_else import (extract_channel_info, extract_search_info,
|
||||
extract_playlist_metadata, extract_playlist_info, extract_comments_info)
|
||||
|
||||
from .watch_extraction import (extract_watch_info, get_caption_url,
|
||||
update_with_age_restricted_info, requires_decryption,
|
||||
extract_decryption_function, decrypt_signatures, _formats,
|
||||
update_format_with_type_info, extract_hls_formats)
|
470
youtube/yt_data_extract/common.py
Normal file
470
youtube/yt_data_extract/common.py
Normal file
@ -0,0 +1,470 @@
|
||||
import re
|
||||
import urllib.parse
|
||||
import collections
|
||||
|
||||
def get(object, key, default=None, types=()):
|
||||
'''Like dict.get(), but returns default if the result doesn't match one of the types.
|
||||
Also works for indexing lists.'''
|
||||
try:
|
||||
result = object[key]
|
||||
except (TypeError, IndexError, KeyError):
|
||||
return default
|
||||
|
||||
if not types or isinstance(result, types):
|
||||
return result
|
||||
else:
|
||||
return default
|
||||
|
||||
def multi_get(object, *keys, default=None, types=()):
|
||||
'''Like get, but try other keys if the first fails'''
|
||||
for key in keys:
|
||||
try:
|
||||
result = object[key]
|
||||
except (TypeError, IndexError, KeyError):
|
||||
pass
|
||||
else:
|
||||
if not types or isinstance(result, types):
|
||||
return result
|
||||
else:
|
||||
continue
|
||||
return default
|
||||
|
||||
|
||||
def deep_get(object, *keys, default=None, types=()):
|
||||
'''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices.
|
||||
Last argument is the default value to use in case of any IndexErrors or KeyErrors.
|
||||
If types is given and the result doesn't match one of those types, default is returned'''
|
||||
try:
|
||||
for key in keys:
|
||||
object = object[key]
|
||||
except (TypeError, IndexError, KeyError):
|
||||
return default
|
||||
else:
|
||||
if not types or isinstance(object, types):
|
||||
return object
|
||||
else:
|
||||
return default
|
||||
|
||||
def multi_deep_get(object, *key_sequences, default=None, types=()):
|
||||
'''Like deep_get, but can try different key sequences in case one fails.
|
||||
Return default if all of them fail. key_sequences is a list of lists'''
|
||||
for key_sequence in key_sequences:
|
||||
_object = object
|
||||
try:
|
||||
for key in key_sequence:
|
||||
_object = _object[key]
|
||||
except (TypeError, IndexError, KeyError):
|
||||
pass
|
||||
else:
|
||||
if not types or isinstance(_object, types):
|
||||
return _object
|
||||
else:
|
||||
continue
|
||||
return default
|
||||
|
||||
def liberal_update(obj, key, value):
|
||||
'''Updates obj[key] with value as long as value is not None.
|
||||
Ensures obj[key] will at least get a value of None, however'''
|
||||
if (value is not None) or (key not in obj):
|
||||
obj[key] = value
|
||||
|
||||
def conservative_update(obj, key, value):
|
||||
'''Only updates obj if it doesn't have key or obj[key] is None'''
|
||||
if obj.get(key) is None:
|
||||
obj[key] = value
|
||||
|
||||
def concat_or_none(*strings):
|
||||
'''Concatenates strings. Returns None if any of the arguments are None'''
|
||||
result = ''
|
||||
for string in strings:
|
||||
if string is None:
|
||||
return None
|
||||
result += string
|
||||
return result
|
||||
|
||||
def remove_redirect(url):
|
||||
if url is None:
|
||||
return None
|
||||
if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking
|
||||
query_string = url[url.find('?')+1: ]
|
||||
return urllib.parse.parse_qs(query_string)['q'][0]
|
||||
return url
|
||||
|
||||
youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
|
||||
def normalize_url(url):
|
||||
if url is None:
|
||||
return None
|
||||
match = youtube_url_re.fullmatch(url)
|
||||
if match is None:
|
||||
raise Exception()
|
||||
|
||||
return 'https://www.youtube.com' + match.group(1)
|
||||
|
||||
def _recover_urls(runs):
|
||||
for run in runs:
|
||||
url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
|
||||
text = run.get('text', '')
|
||||
# second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text
|
||||
if url is not None and (text.startswith('http://') or text.startswith('https://')):
|
||||
url = remove_redirect(url)
|
||||
run['url'] = url
|
||||
run['text'] = url # youtube truncates the url text, use actual url instead
|
||||
|
||||
def extract_str(node, default=None, recover_urls=False):
|
||||
'''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)'''
|
||||
if isinstance(node, str):
|
||||
return node
|
||||
|
||||
try:
|
||||
return node['simpleText']
|
||||
except (KeyError, TypeError):
|
||||
pass
|
||||
|
||||
if isinstance(node, dict) and 'runs' in node:
|
||||
if recover_urls:
|
||||
_recover_urls(node['runs'])
|
||||
return ''.join(text_run.get('text', '') for text_run in node['runs'])
|
||||
|
||||
return default
|
||||
|
||||
def extract_formatted_text(node):
|
||||
if not node:
|
||||
return []
|
||||
if 'runs' in node:
|
||||
_recover_urls(node['runs'])
|
||||
return node['runs']
|
||||
elif 'simpleText' in node:
|
||||
return [{'text': node['simpleText']}]
|
||||
return []
|
||||
|
||||
def extract_int(string, default=None):
|
||||
if isinstance(string, int):
|
||||
return string
|
||||
if not isinstance(string, str):
|
||||
string = extract_str(string)
|
||||
if not string:
|
||||
return default
|
||||
match = re.search(r'\b(\d+)\b', string.replace(',', ''))
|
||||
if match is None:
|
||||
return default
|
||||
try:
|
||||
return int(match.group(1))
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
def extract_approx_int(string):
|
||||
'''e.g. "15.1M" from "15.1M subscribers"'''
|
||||
if not isinstance(string, str):
|
||||
string = extract_str(string)
|
||||
if not string:
|
||||
return None
|
||||
match = re.search(r'\b(\d+(?:\.\d+)?[KMBTkmbt]?)\b', string.replace(',', ''))
|
||||
if match is None:
|
||||
return None
|
||||
return match.group(1)
|
||||
|
||||
MONTH_ABBREVIATIONS = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'}
|
||||
def extract_date(date_text):
|
||||
'''Input: "Mar 9, 2019". Output: "2019-3-9"'''
|
||||
if not isinstance(date_text, str):
|
||||
date_text = extract_str(date_text)
|
||||
if date_text is None:
|
||||
return None
|
||||
|
||||
date_text = date_text.replace(',', '').lower()
|
||||
parts = date_text.split()
|
||||
if len(parts) >= 3:
|
||||
month, day, year = parts[-3:]
|
||||
month = MONTH_ABBREVIATIONS.get(month[0:3]) # slicing in case they start writing out the full month name
|
||||
if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None):
|
||||
return year + '-' + month + '-' + day
|
||||
return None
|
||||
|
||||
def check_missing_keys(object, *key_sequences):
|
||||
for key_sequence in key_sequences:
|
||||
_object = object
|
||||
try:
|
||||
for key in key_sequence:
|
||||
_object = _object[key]
|
||||
except (KeyError, IndexError, TypeError):
|
||||
return 'Could not find ' + key
|
||||
|
||||
return None
|
||||
|
||||
def extract_item_info(item, additional_info={}):
|
||||
if not item:
|
||||
return {'error': 'No item given'}
|
||||
|
||||
type = get(list(item.keys()), 0)
|
||||
if not type:
|
||||
return {'error': 'Could not find type'}
|
||||
item = item[type]
|
||||
|
||||
info = {'error': None}
|
||||
if type in ('itemSectionRenderer', 'compactAutoplayRenderer'):
|
||||
return extract_item_info(deep_get(item, 'contents', 0), additional_info)
|
||||
|
||||
if type in ('movieRenderer', 'clarificationRenderer'):
|
||||
info['type'] = 'unsupported'
|
||||
return info
|
||||
|
||||
info.update(additional_info)
|
||||
|
||||
# type looks like e.g. 'compactVideoRenderer' or 'gridVideoRenderer'
|
||||
# camelCase split, https://stackoverflow.com/a/37697078
|
||||
type_parts = [s.lower() for s in re.sub(r'([A-Z][a-z]+)', r' \1', type).split()]
|
||||
if len(type_parts) < 2:
|
||||
info['type'] = 'unsupported'
|
||||
return
|
||||
primary_type = type_parts[-2]
|
||||
if primary_type == 'video':
|
||||
info['type'] = 'video'
|
||||
elif primary_type in ('playlist', 'radio', 'show'):
|
||||
info['type'] = 'playlist'
|
||||
elif primary_type == 'channel':
|
||||
info['type'] = 'channel'
|
||||
elif type == 'videoWithContextRenderer': # stupid exception
|
||||
info['type'] = 'video'
|
||||
primary_type = 'video'
|
||||
else:
|
||||
info['type'] = 'unsupported'
|
||||
|
||||
# videoWithContextRenderer changes it to 'headline' just to be annoying
|
||||
info['title'] = extract_str(multi_get(item, 'title', 'headline'))
|
||||
if primary_type != 'channel':
|
||||
info['author'] = extract_str(multi_get(item, 'longBylineText', 'shortBylineText', 'ownerText'))
|
||||
info['author_id'] = extract_str(multi_deep_get(item,
|
||||
['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
||||
['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
||||
['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId']
|
||||
))
|
||||
info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
|
||||
info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText'))
|
||||
info['thumbnail'] = multi_deep_get(item,
|
||||
['thumbnail', 'thumbnails', 0, 'url'], # videos
|
||||
['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists
|
||||
['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
|
||||
)
|
||||
|
||||
info['badges'] = []
|
||||
for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()):
|
||||
badge = deep_get(badge_node, 'metadataBadgeRenderer', 'label')
|
||||
if badge:
|
||||
info['badges'].append(badge)
|
||||
|
||||
if primary_type in ('video', 'playlist'):
|
||||
info['time_published'] = None
|
||||
timestamp = re.search(r'(\d+ \w+ ago)',
|
||||
extract_str(item.get('publishedTimeText'), default=''))
|
||||
if timestamp:
|
||||
info['time_published'] = timestamp.group(1)
|
||||
|
||||
if primary_type == 'video':
|
||||
info['id'] = item.get('videoId')
|
||||
info['view_count'] = extract_int(item.get('viewCountText'))
|
||||
|
||||
# dig into accessibility data to get view_count for videos marked as recommended, and to get time_published
|
||||
accessibility_label = multi_deep_get(item,
|
||||
['title', 'accessibility', 'accessibilityData', 'label'],
|
||||
['headline', 'accessibility', 'accessibilityData', 'label'],
|
||||
default='')
|
||||
timestamp = re.search(r'(\d+ \w+ ago)', accessibility_label)
|
||||
if timestamp:
|
||||
conservative_update(info, 'time_published', timestamp.group(1))
|
||||
view_count = re.search(r'(\d+) views', accessibility_label.replace(',', ''))
|
||||
if view_count:
|
||||
conservative_update(info, 'view_count', int(view_count.group(1)))
|
||||
|
||||
if info['view_count']:
|
||||
info['approx_view_count'] = '{:,}'.format(info['view_count'])
|
||||
else:
|
||||
info['approx_view_count'] = extract_approx_int(item.get('shortViewCountText'))
|
||||
|
||||
# handle case where it is "No views"
|
||||
if not info['approx_view_count']:
|
||||
if ('No views' in item.get('shortViewCountText', '')
|
||||
or 'no views' in accessibility_label.lower()):
|
||||
info['view_count'] = 0
|
||||
info['approx_view_count'] = '0'
|
||||
|
||||
info['duration'] = extract_str(item.get('lengthText'))
|
||||
|
||||
# if it's an item in a playlist, get its index
|
||||
if 'index' in item: # url has wrong index on playlist page
|
||||
info['index'] = extract_int(item.get('index'))
|
||||
elif 'indexText' in item:
|
||||
# Current item in playlist has ▶ instead of the actual index, must
|
||||
# dig into url
|
||||
match = re.search(r'index=(\d+)', deep_get(item,
|
||||
'navigationEndpoint', 'commandMetadata', 'webCommandMetadata',
|
||||
'url', default=''))
|
||||
if match is None: # worth a try then
|
||||
info['index'] = extract_int(item.get('indexText'))
|
||||
else:
|
||||
info['index'] = int(match.group(1))
|
||||
else:
|
||||
info['index'] = None
|
||||
|
||||
elif primary_type in ('playlist', 'radio'):
|
||||
info['id'] = item.get('playlistId')
|
||||
info['video_count'] = extract_int(item.get('videoCount'))
|
||||
elif primary_type == 'channel':
|
||||
info['id'] = item.get('channelId')
|
||||
info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText'))
|
||||
elif primary_type == 'show':
|
||||
info['id'] = deep_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId')
|
||||
|
||||
if primary_type in ('playlist', 'channel'):
|
||||
conservative_update(info, 'video_count', extract_int(item.get('videoCountText')))
|
||||
|
||||
for overlay in item.get('thumbnailOverlays', []):
|
||||
conservative_update(info, 'duration', extract_str(deep_get(
|
||||
overlay, 'thumbnailOverlayTimeStatusRenderer', 'text'
|
||||
)))
|
||||
# show renderers don't have videoCountText
|
||||
conservative_update(info, 'video_count', extract_int(deep_get(
|
||||
overlay, 'thumbnailOverlayBottomPanelRenderer', 'text'
|
||||
)))
|
||||
return info
|
||||
|
||||
def extract_response(polymer_json):
|
||||
'''return response, error'''
|
||||
response = multi_deep_get(polymer_json, [1, 'response'], ['response'])
|
||||
if response is None:
|
||||
return None, 'Failed to extract response'
|
||||
else:
|
||||
return response, None
|
||||
|
||||
|
||||
_item_types = {
|
||||
'movieRenderer',
|
||||
'didYouMeanRenderer',
|
||||
'showingResultsForRenderer',
|
||||
|
||||
'videoRenderer',
|
||||
'compactVideoRenderer',
|
||||
'compactAutoplayRenderer',
|
||||
'videoWithContextRenderer',
|
||||
'gridVideoRenderer',
|
||||
'playlistVideoRenderer',
|
||||
|
||||
'playlistRenderer',
|
||||
'compactPlaylistRenderer',
|
||||
'gridPlaylistRenderer',
|
||||
|
||||
'radioRenderer',
|
||||
'compactRadioRenderer',
|
||||
'gridRadioRenderer',
|
||||
|
||||
'showRenderer',
|
||||
'compactShowRenderer',
|
||||
'gridShowRenderer',
|
||||
|
||||
|
||||
'channelRenderer',
|
||||
'compactChannelRenderer',
|
||||
'gridChannelRenderer',
|
||||
}
|
||||
|
||||
def _traverse_browse_renderer(renderer):
|
||||
for tab in get(renderer, 'tabs', ()):
|
||||
tab_renderer = multi_get(tab, 'tabRenderer', 'expandableTabRenderer')
|
||||
if tab_renderer is None:
|
||||
continue
|
||||
if tab_renderer.get('selected', False):
|
||||
return get(tab_renderer, 'content', {})
|
||||
print('Could not find tab with content')
|
||||
return {}
|
||||
|
||||
def _traverse_standard_list(renderer):
|
||||
renderer_list = multi_get(renderer, 'contents', 'items', default=())
|
||||
continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation')
|
||||
return renderer_list, continuation
|
||||
|
||||
# these renderers contain one inside them
|
||||
nested_renderer_dispatch = {
|
||||
'singleColumnBrowseResultsRenderer': _traverse_browse_renderer,
|
||||
'twoColumnBrowseResultsRenderer': _traverse_browse_renderer,
|
||||
'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}),
|
||||
}
|
||||
|
||||
# these renderers contain a list of renderers inside them
|
||||
nested_renderer_list_dispatch = {
|
||||
'sectionListRenderer': _traverse_standard_list,
|
||||
'itemSectionRenderer': _traverse_standard_list,
|
||||
'gridRenderer': _traverse_standard_list,
|
||||
'playlistVideoListRenderer': _traverse_standard_list,
|
||||
'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[]), None),
|
||||
}
|
||||
def get_nested_renderer_list_function(key):
|
||||
if key in nested_renderer_list_dispatch:
|
||||
return nested_renderer_list_dispatch[key]
|
||||
elif key.endswith('Continuation'):
|
||||
return _traverse_standard_list
|
||||
return None
|
||||
|
||||
def extract_items_from_renderer(renderer, item_types=_item_types):
|
||||
ctoken = None
|
||||
items = []
|
||||
|
||||
iter_stack = collections.deque()
|
||||
current_iter = iter(())
|
||||
|
||||
while True:
|
||||
# mode 1: get a new renderer by iterating.
|
||||
# goes down the stack for an iterator if one has been exhausted
|
||||
if not renderer:
|
||||
try:
|
||||
renderer = current_iter.__next__()
|
||||
except StopIteration:
|
||||
try:
|
||||
current_iter = iter_stack.pop()
|
||||
except IndexError:
|
||||
return items, ctoken
|
||||
# Get new renderer or check that the one we got is good before
|
||||
# proceeding to mode 2
|
||||
continue
|
||||
|
||||
|
||||
# mode 2: dig into the current renderer
|
||||
key, value = list(renderer.items())[0]
|
||||
|
||||
# the renderer is an item
|
||||
if key in item_types:
|
||||
items.append(renderer)
|
||||
|
||||
# has a list in it, add it to the iter stack
|
||||
elif get_nested_renderer_list_function(key):
|
||||
renderer_list, cont = get_nested_renderer_list_function(key)(value)
|
||||
if renderer_list:
|
||||
iter_stack.append(current_iter)
|
||||
current_iter = iter(renderer_list)
|
||||
if cont:
|
||||
ctoken = cont
|
||||
|
||||
# new renderer nested inside this one
|
||||
elif key in nested_renderer_dispatch:
|
||||
renderer = nested_renderer_dispatch[key](value)
|
||||
continue # don't reset renderer to None
|
||||
|
||||
renderer = None
|
||||
|
||||
def extract_items(response, item_types=_item_types):
|
||||
'''return items, ctoken'''
|
||||
if 'continuationContents' in response:
|
||||
# sometimes there's another, empty, junk [something]Continuation key
|
||||
# find real one
|
||||
for key, renderer_cont in get(response,
|
||||
'continuationContents', {}).items():
|
||||
# e.g. commentSectionContinuation, playlistVideoListContinuation
|
||||
if key.endswith('Continuation'):
|
||||
items, cont = extract_items_from_renderer({key: renderer_cont},
|
||||
item_types=item_types)
|
||||
if items:
|
||||
return items, cont
|
||||
return [], None
|
||||
elif 'contents' in response:
|
||||
renderer = get(response, 'contents', {})
|
||||
return extract_items_from_renderer(renderer, item_types=item_types)
|
||||
else:
|
||||
return [], None
|
281
youtube/yt_data_extract/everything_else.py
Normal file
281
youtube/yt_data_extract/everything_else.py
Normal file
@ -0,0 +1,281 @@
|
||||
from .common import (get, multi_get, deep_get, multi_deep_get,
|
||||
liberal_update, conservative_update, remove_redirect, normalize_url,
|
||||
extract_str, extract_formatted_text, extract_int, extract_approx_int,
|
||||
extract_date, check_missing_keys, extract_item_info, extract_items,
|
||||
extract_response)
|
||||
from youtube import proto
|
||||
|
||||
import re
|
||||
import urllib
|
||||
from math import ceil
|
||||
|
||||
def extract_channel_info(polymer_json, tab):
|
||||
response, err = extract_response(polymer_json)
|
||||
if err:
|
||||
return {'error': err}
|
||||
|
||||
|
||||
metadata = deep_get(response, 'metadata', 'channelMetadataRenderer',
|
||||
default={})
|
||||
if not metadata:
|
||||
metadata = deep_get(response, 'microformat', 'microformatDataRenderer',
|
||||
default={})
|
||||
|
||||
# channel doesn't exist or was terminated
|
||||
# example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org
|
||||
if not metadata:
|
||||
if response.get('alerts'):
|
||||
error_string = ' '.join(
|
||||
extract_str(deep_get(alert, 'alertRenderer', 'text'), default='')
|
||||
for alert in response['alerts']
|
||||
)
|
||||
if not error_string:
|
||||
error_string = 'Failed to extract error'
|
||||
return {'error': error_string}
|
||||
elif deep_get(response, 'responseContext', 'errors'):
|
||||
for error in response['responseContext']['errors'].get('error', []):
|
||||
if error.get('code') == 'INVALID_VALUE' and error.get('location') == 'browse_id':
|
||||
return {'error': 'This channel does not exist'}
|
||||
return {'error': 'Failure getting metadata'}
|
||||
|
||||
info = {'error': None}
|
||||
info['current_tab'] = tab
|
||||
|
||||
info['approx_subscriber_count'] = extract_approx_int(deep_get(response,
|
||||
'header', 'c4TabbedHeaderRenderer', 'subscriberCountText'))
|
||||
|
||||
# stuff from microformat (info given by youtube for every page on channel)
|
||||
info['short_description'] = metadata.get('description')
|
||||
if info['short_description'] and len(info['short_description']) > 730:
|
||||
info['short_description'] = info['short_description'][0:730] + '...'
|
||||
info['channel_name'] = metadata.get('title')
|
||||
info['avatar'] = multi_deep_get(metadata,
|
||||
['avatar', 'thumbnails', 0, 'url'],
|
||||
['thumbnail', 'thumbnails', 0, 'url'],
|
||||
)
|
||||
channel_url = multi_get(metadata, 'urlCanonical', 'channelUrl')
|
||||
if channel_url:
|
||||
channel_id = get(channel_url.rstrip('/').split('/'), -1)
|
||||
info['channel_id'] = channel_id
|
||||
else:
|
||||
info['channel_id'] = metadata.get('externalId')
|
||||
if info['channel_id']:
|
||||
info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id
|
||||
else:
|
||||
info['channel_url'] = None
|
||||
|
||||
# get items
|
||||
info['items'] = []
|
||||
|
||||
# empty channel
|
||||
if 'contents' not in response and 'continuationContents' not in response:
|
||||
return info
|
||||
|
||||
if tab in ('videos', 'playlists', 'search'):
|
||||
items, ctoken = extract_items(response)
|
||||
additional_info = {'author': info['channel_name'], 'author_url': info['channel_url']}
|
||||
info['items'] = [extract_item_info(renderer, additional_info) for renderer in items]
|
||||
if tab == 'search':
|
||||
info['is_last_page'] = (ctoken is None)
|
||||
elif tab == 'about':
|
||||
items, _ = extract_items(response, item_types={'channelAboutFullMetadataRenderer'})
|
||||
if not items:
|
||||
info['error'] = 'Could not find channelAboutFullMetadataRenderer'
|
||||
return info
|
||||
channel_metadata = items[0]['channelAboutFullMetadataRenderer']
|
||||
|
||||
info['links'] = []
|
||||
for link_json in channel_metadata.get('primaryLinks', ()):
|
||||
url = remove_redirect(deep_get(link_json, 'navigationEndpoint', 'urlEndpoint', 'url'))
|
||||
text = extract_str(link_json.get('title'))
|
||||
info['links'].append( (text, url) )
|
||||
|
||||
info['date_joined'] = extract_date(channel_metadata.get('joinedDateText'))
|
||||
info['view_count'] = extract_int(channel_metadata.get('viewCountText'))
|
||||
info['description'] = extract_str(channel_metadata.get('description'), default='')
|
||||
else:
|
||||
raise NotImplementedError('Unknown or unsupported channel tab: ' + tab)
|
||||
|
||||
return info
|
||||
|
||||
def extract_search_info(polymer_json):
|
||||
response, err = extract_response(polymer_json)
|
||||
if err:
|
||||
return {'error': err}
|
||||
info = {'error': None}
|
||||
info['estimated_results'] = int(response['estimatedResults'])
|
||||
info['estimated_pages'] = ceil(info['estimated_results']/20)
|
||||
|
||||
|
||||
results, _ = extract_items(response)
|
||||
|
||||
|
||||
info['items'] = []
|
||||
info['corrections'] = {'type': None}
|
||||
for renderer in results:
|
||||
type = list(renderer.keys())[0]
|
||||
if type == 'shelfRenderer':
|
||||
continue
|
||||
if type == 'didYouMeanRenderer':
|
||||
renderer = renderer[type]
|
||||
|
||||
info['corrections'] = {
|
||||
'type': 'did_you_mean',
|
||||
'corrected_query': renderer['correctedQueryEndpoint']['searchEndpoint']['query'],
|
||||
'corrected_query_text': renderer['correctedQuery']['runs'],
|
||||
}
|
||||
continue
|
||||
if type == 'showingResultsForRenderer':
|
||||
renderer = renderer[type]
|
||||
|
||||
info['corrections'] = {
|
||||
'type': 'showing_results_for',
|
||||
'corrected_query_text': renderer['correctedQuery']['runs'],
|
||||
'original_query_text': renderer['originalQuery']['simpleText'],
|
||||
}
|
||||
continue
|
||||
|
||||
i_info = extract_item_info(renderer)
|
||||
if i_info.get('type') != 'unsupported':
|
||||
info['items'].append(i_info)
|
||||
|
||||
|
||||
return info
|
||||
|
||||
def extract_playlist_metadata(polymer_json):
|
||||
response, err = extract_response(polymer_json)
|
||||
if err:
|
||||
return {'error': err}
|
||||
|
||||
metadata = {'error': None}
|
||||
header = deep_get(response, 'header', 'playlistHeaderRenderer', default={})
|
||||
metadata['title'] = extract_str(header.get('title'))
|
||||
|
||||
metadata['first_video_id'] = deep_get(header, 'playEndpoint', 'watchEndpoint', 'videoId')
|
||||
first_id = re.search(r'([a-z_\-]{11})', deep_get(header,
|
||||
'thumbnail', 'thumbnails', 0, 'url', default=''))
|
||||
if first_id:
|
||||
conservative_update(metadata, 'first_video_id', first_id.group(1))
|
||||
if metadata['first_video_id'] is None:
|
||||
metadata['thumbnail'] = None
|
||||
else:
|
||||
metadata['thumbnail'] = 'https://i.ytimg.com/vi/' + metadata['first_video_id'] + '/mqdefault.jpg'
|
||||
|
||||
metadata['video_count'] = extract_int(header.get('numVideosText'))
|
||||
metadata['description'] = extract_str(header.get('descriptionText'), default='')
|
||||
metadata['author'] = extract_str(header.get('ownerText'))
|
||||
metadata['author_id'] = multi_deep_get(header,
|
||||
['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
||||
['ownerEndpoint', 'browseEndpoint', 'browseId'])
|
||||
if metadata['author_id']:
|
||||
metadata['author_url'] = 'https://www.youtube.com/channel/' + metadata['author_id']
|
||||
else:
|
||||
metadata['author_url'] = None
|
||||
metadata['view_count'] = extract_int(header.get('viewCountText'))
|
||||
metadata['like_count'] = extract_int(header.get('likesCountWithoutLikeText'))
|
||||
for stat in header.get('stats', ()):
|
||||
text = extract_str(stat)
|
||||
if 'videos' in text:
|
||||
conservative_update(metadata, 'video_count', extract_int(text))
|
||||
elif 'views' in text:
|
||||
conservative_update(metadata, 'view_count', extract_int(text))
|
||||
elif 'updated' in text:
|
||||
metadata['time_published'] = extract_date(text)
|
||||
|
||||
return metadata
|
||||
|
||||
def extract_playlist_info(polymer_json):
|
||||
response, err = extract_response(polymer_json)
|
||||
if err:
|
||||
return {'error': err}
|
||||
info = {'error': None}
|
||||
first_page = 'continuationContents' not in response
|
||||
video_list, _ = extract_items(response)
|
||||
|
||||
info['items'] = [extract_item_info(renderer) for renderer in video_list]
|
||||
|
||||
if first_page:
|
||||
info['metadata'] = extract_playlist_metadata(polymer_json)
|
||||
|
||||
return info
|
||||
|
||||
def _ctoken_metadata(ctoken):
|
||||
result = dict()
|
||||
params = proto.parse(proto.b64_to_bytes(ctoken))
|
||||
result['video_id'] = proto.parse(params[2])[2].decode('ascii')
|
||||
|
||||
offset_information = proto.parse(params[6])
|
||||
result['offset'] = offset_information.get(5, 0)
|
||||
|
||||
result['is_replies'] = False
|
||||
if (3 in offset_information) and (2 in proto.parse(offset_information[3])):
|
||||
result['is_replies'] = True
|
||||
result['sort'] = None
|
||||
else:
|
||||
try:
|
||||
result['sort'] = proto.parse(offset_information[4])[6]
|
||||
except KeyError:
|
||||
result['sort'] = 0
|
||||
return result
|
||||
|
||||
def extract_comments_info(polymer_json):
|
||||
response, err = extract_response(polymer_json)
|
||||
if err:
|
||||
return {'error': err}
|
||||
info = {'error': None}
|
||||
|
||||
url = multi_deep_get(polymer_json, [1, 'url'], ['url'])
|
||||
if url:
|
||||
ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
|
||||
metadata = _ctoken_metadata(ctoken)
|
||||
else:
|
||||
metadata = {}
|
||||
info['video_id'] = metadata.get('video_id')
|
||||
info['offset'] = metadata.get('offset')
|
||||
info['is_replies'] = metadata.get('is_replies')
|
||||
info['sort'] = metadata.get('sort')
|
||||
info['video_title'] = None
|
||||
|
||||
comments, ctoken = extract_items(response,
|
||||
item_types={'commentThreadRenderer', 'commentRenderer'})
|
||||
info['comments'] = []
|
||||
info['ctoken'] = ctoken
|
||||
for comment in comments:
|
||||
comment_info = {}
|
||||
|
||||
if 'commentThreadRenderer' in comment: # top level comments
|
||||
conservative_update(info, 'is_replies', False)
|
||||
comment_thread = comment['commentThreadRenderer']
|
||||
info['video_title'] = extract_str(comment_thread.get('commentTargetTitle'))
|
||||
if 'replies' not in comment_thread:
|
||||
comment_info['reply_count'] = 0
|
||||
else:
|
||||
comment_info['reply_count'] = extract_int(deep_get(comment_thread,
|
||||
'replies', 'commentRepliesRenderer', 'moreText'
|
||||
), default=1) # With 1 reply, the text reads "View reply"
|
||||
comment_renderer = deep_get(comment_thread, 'comment', 'commentRenderer', default={})
|
||||
elif 'commentRenderer' in comment: # replies
|
||||
comment_info['reply_count'] = 0 # replyCount, below, not present for replies even if the reply has further replies to it
|
||||
conservative_update(info, 'is_replies', True)
|
||||
comment_renderer = comment['commentRenderer']
|
||||
else:
|
||||
comment_renderer = {}
|
||||
|
||||
# These 3 are sometimes absent, likely because the channel was deleted
|
||||
comment_info['author'] = extract_str(comment_renderer.get('authorText'))
|
||||
comment_info['author_url'] = deep_get(comment_renderer,
|
||||
'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
|
||||
comment_info['author_id'] = deep_get(comment_renderer,
|
||||
'authorEndpoint', 'browseEndpoint', 'browseId')
|
||||
|
||||
comment_info['author_avatar'] = deep_get(comment_renderer,
|
||||
'authorThumbnail', 'thumbnails', 0, 'url')
|
||||
comment_info['id'] = comment_renderer.get('commentId')
|
||||
comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText'))
|
||||
comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText'))
|
||||
comment_info['like_count'] = comment_renderer.get('likeCount')
|
||||
liberal_update(comment_info, 'reply_count', comment_renderer.get('replyCount'))
|
||||
|
||||
info['comments'].append(comment_info)
|
||||
|
||||
return info
|
689
youtube/yt_data_extract/watch_extraction.py
Normal file
689
youtube/yt_data_extract/watch_extraction.py
Normal file
@ -0,0 +1,689 @@
|
||||
from .common import (get, multi_get, deep_get, multi_deep_get,
|
||||
liberal_update, conservative_update, remove_redirect, normalize_url,
|
||||
extract_str, extract_formatted_text, extract_int, extract_approx_int,
|
||||
extract_date, check_missing_keys, extract_item_info, extract_items,
|
||||
extract_response, concat_or_none)
|
||||
|
||||
import json
|
||||
import urllib.parse
|
||||
import traceback
|
||||
import re
|
||||
|
||||
# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py
|
||||
_formats = {
|
||||
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
|
||||
'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
|
||||
'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
|
||||
'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'mp4v'},
|
||||
'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 96, 'vcodec': 'h264'},
|
||||
'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
||||
'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), audio_bitrate varies as well
|
||||
'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
|
||||
'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
||||
'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
||||
'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
|
||||
'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
|
||||
'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
||||
'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
||||
'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
|
||||
|
||||
# 3D videos
|
||||
'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
||||
'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
||||
'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
|
||||
'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
||||
'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
||||
|
||||
# Apple HTTP Live Streaming
|
||||
'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
|
||||
'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
|
||||
'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
|
||||
'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
|
||||
'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
|
||||
'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'h264'},
|
||||
|
||||
# DASH mp4 video
|
||||
'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
|
||||
'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
|
||||
'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
|
||||
'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
|
||||
# Dash mp4 audio
|
||||
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 48, 'container': 'm4a_dash'},
|
||||
'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 128, 'container': 'm4a_dash'},
|
||||
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 256, 'container': 'm4a_dash'},
|
||||
'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
|
||||
'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
|
||||
'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
|
||||
'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
|
||||
|
||||
# Dash webm
|
||||
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
|
||||
'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
|
||||
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
||||
'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
||||
'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
||||
'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
||||
|
||||
# Dash webm audio
|
||||
'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 128},
|
||||
'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 256},
|
||||
|
||||
# Dash webm audio with opus inside
|
||||
'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 50},
|
||||
'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 70},
|
||||
'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 160},
|
||||
|
||||
# RTMP (unnamed)
|
||||
'_rtmp': {'protocol': 'rtmp'},
|
||||
|
||||
# av01 video only formats sometimes served with "unknown" codecs
|
||||
'394': {'vcodec': 'av01.0.05M.08'},
|
||||
'395': {'vcodec': 'av01.0.05M.08'},
|
||||
'396': {'vcodec': 'av01.0.05M.08'},
|
||||
'397': {'vcodec': 'av01.0.05M.08'},
|
||||
}
|
||||
|
||||
def _extract_metadata_row_info(video_renderer_info):
|
||||
# extract category and music list
|
||||
info = {
|
||||
'category': None,
|
||||
'music_list': [],
|
||||
}
|
||||
|
||||
current_song = {}
|
||||
for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
|
||||
row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='')
|
||||
row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0))
|
||||
if row_title == 'Category':
|
||||
info['category'] = row_content
|
||||
elif row_title in ('Song', 'Music'):
|
||||
if current_song:
|
||||
info['music_list'].append(current_song)
|
||||
current_song = {'title': row_content}
|
||||
elif row_title == 'Artist':
|
||||
current_song['artist'] = row_content
|
||||
elif row_title == 'Album':
|
||||
current_song['album'] = row_content
|
||||
elif row_title == 'Writers':
|
||||
current_song['writers'] = row_content
|
||||
elif row_title.startswith('Licensed'):
|
||||
current_song['licensor'] = row_content
|
||||
if current_song:
|
||||
info['music_list'].append(current_song)
|
||||
|
||||
return info
|
||||
|
||||
def _extract_watch_info_mobile(top_level):
|
||||
info = {}
|
||||
microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
|
||||
|
||||
family_safe = microformat.get('isFamilySafe')
|
||||
if family_safe is None:
|
||||
info['age_restricted'] = None
|
||||
else:
|
||||
info['age_restricted'] = not family_safe
|
||||
info['allowed_countries'] = microformat.get('availableCountries', [])
|
||||
info['time_published'] = microformat.get('publishDate')
|
||||
|
||||
response = top_level.get('response', {})
|
||||
|
||||
# this renderer has the stuff visible on the page
|
||||
# check for playlist
|
||||
items, _ = extract_items(response,
|
||||
item_types={'singleColumnWatchNextResults'})
|
||||
if items:
|
||||
watch_next_results = items[0]['singleColumnWatchNextResults']
|
||||
playlist = deep_get(watch_next_results, 'playlist', 'playlist')
|
||||
if playlist is None:
|
||||
info['playlist'] = None
|
||||
else:
|
||||
info['playlist'] = {}
|
||||
info['playlist']['title'] = playlist.get('title')
|
||||
info['playlist']['author'] = extract_str(multi_get(playlist,
|
||||
'ownerName', 'longBylineText', 'shortBylineText', 'ownerText'))
|
||||
author_id = deep_get(playlist, 'longBylineText', 'runs', 0,
|
||||
'navigationEndpoint', 'browseEndpoint', 'browseId')
|
||||
info['playlist']['author_id'] = author_id
|
||||
if author_id:
|
||||
info['playlist']['author_url'] = concat_or_none(
|
||||
'https://www.youtube.com/channel/', author_id)
|
||||
info['playlist']['id'] = playlist.get('playlistId')
|
||||
info['playlist']['url'] = concat_or_none(
|
||||
'https://www.youtube.com/playlist?list=',
|
||||
info['playlist']['id'])
|
||||
info['playlist']['video_count'] = playlist.get('totalVideos')
|
||||
info['playlist']['current_index'] = playlist.get('currentIndex')
|
||||
info['playlist']['items'] = [
|
||||
extract_item_info(i) for i in playlist.get('contents', ())]
|
||||
else:
|
||||
info['playlist'] = None
|
||||
|
||||
# Holds the visible video info. It is inside singleColumnWatchNextResults
|
||||
# but use our convenience function instead
|
||||
items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'})
|
||||
if items:
|
||||
video_info = items[0]['slimVideoMetadataRenderer']
|
||||
else:
|
||||
print('Failed to extract video metadata')
|
||||
video_info = {}
|
||||
|
||||
info.update(_extract_metadata_row_info(video_info))
|
||||
info['description'] = extract_str(video_info.get('description'), recover_urls=True)
|
||||
info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle')))
|
||||
info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
|
||||
info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
|
||||
info['title'] = extract_str(video_info.get('title'))
|
||||
info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='')
|
||||
info['unlisted'] = False
|
||||
for badge in video_info.get('badges', []):
|
||||
if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
|
||||
info['unlisted'] = True
|
||||
info['like_count'] = None
|
||||
info['dislike_count'] = None
|
||||
if not info['time_published']:
|
||||
info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
|
||||
for button in video_info.get('buttons', ()):
|
||||
button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
|
||||
|
||||
# all the digits can be found in the accessibility data
|
||||
count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))
|
||||
|
||||
# this count doesn't have all the digits, it's like 53K for instance
|
||||
dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
|
||||
|
||||
# the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
|
||||
if dumb_count == 0:
|
||||
count = 0
|
||||
|
||||
if 'isLike' in button_renderer:
|
||||
info['like_count'] = count
|
||||
elif 'isDislike' in button_renderer:
|
||||
info['dislike_count'] = count
|
||||
|
||||
# comment section info
|
||||
items, _ = extract_items(response, item_types={
|
||||
'commentSectionRenderer', 'commentsEntryPointHeaderRenderer'})
|
||||
if items:
|
||||
header_type = list(items[0])[0]
|
||||
comment_info = items[0][header_type]
|
||||
# This seems to be some kind of A/B test being done on mobile, where
|
||||
# this is present instead of the normal commentSectionRenderer. It can
|
||||
# be seen here:
|
||||
# https://www.androidpolice.com/2019/10/31/google-youtube-app-comment-section-below-videos/
|
||||
# https://www.youtube.com/watch?v=bR5Q-wD-6qo
|
||||
if header_type == 'commentsEntryPointHeaderRenderer':
|
||||
comment_count_text = extract_str(comment_info.get('headerText'))
|
||||
else:
|
||||
comment_count_text = extract_str(deep_get(comment_info,
|
||||
'header', 'commentSectionHeaderRenderer', 'countText'))
|
||||
if comment_count_text == 'Comments': # just this with no number, means 0 comments
|
||||
info['comment_count'] = 0
|
||||
else:
|
||||
info['comment_count'] = extract_int(comment_count_text)
|
||||
info['comments_disabled'] = False
|
||||
else: # no comment section present means comments are disabled
|
||||
info['comment_count'] = 0
|
||||
info['comments_disabled'] = True
|
||||
|
||||
# check for limited state
|
||||
items, _ = extract_items(response, item_types={'limitedStateMessageRenderer'})
|
||||
if items:
|
||||
info['limited_state'] = True
|
||||
else:
|
||||
info['limited_state'] = False
|
||||
|
||||
# related videos
|
||||
related, _ = extract_items(response)
|
||||
info['related_videos'] = [extract_item_info(renderer) for renderer in related]
|
||||
|
||||
return info
|
||||
|
||||
def _extract_watch_info_desktop(top_level):
|
||||
info = {
|
||||
'comment_count': None,
|
||||
'comments_disabled': None,
|
||||
'allowed_countries': [],
|
||||
'limited_state': None,
|
||||
'playlist': None,
|
||||
}
|
||||
|
||||
video_info = {}
|
||||
for renderer in deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()):
|
||||
if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'):
|
||||
video_info.update(list(renderer.values())[0])
|
||||
|
||||
info.update(_extract_metadata_row_info(video_info))
|
||||
info['description'] = extract_str(video_info.get('description', None), recover_urls=True)
|
||||
info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
|
||||
|
||||
likes_dislikes = deep_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
|
||||
if len(likes_dislikes) == 2:
|
||||
info['like_count'] = extract_int(likes_dislikes[0])
|
||||
info['dislike_count'] = extract_int(likes_dislikes[1])
|
||||
else:
|
||||
info['like_count'] = None
|
||||
info['dislike_count'] = None
|
||||
|
||||
info['title'] = extract_str(video_info.get('title', None))
|
||||
info['author'] = extract_str(deep_get(video_info, 'owner', 'videoOwnerRenderer', 'title'))
|
||||
info['author_id'] = deep_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
|
||||
info['view_count'] = extract_int(extract_str(deep_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))
|
||||
|
||||
related = deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
|
||||
info['related_videos'] = [extract_item_info(renderer) for renderer in related]
|
||||
|
||||
return info
|
||||
|
||||
def update_format_with_codec_info(fmt, codec):
|
||||
if (codec.startswith('av')
|
||||
or codec in ('vp9', 'vp8', 'vp8.0', 'h263', 'h264', 'mp4v')):
|
||||
if codec == 'vp8.0':
|
||||
codec = 'vp8'
|
||||
conservative_update(fmt, 'vcodec', codec)
|
||||
elif (codec.startswith('mp4a')
|
||||
or codec in ('opus', 'mp3', 'aac', 'dtse', 'ec-3', 'vorbis')):
|
||||
conservative_update(fmt, 'acodec', codec)
|
||||
else:
|
||||
print('Warning: unrecognized codec: ' + codec)
|
||||
|
||||
fmt_type_re = re.compile(
|
||||
r'(text|audio|video)/([\w0-9]+); codecs="([\w0-9\.]+(?:, [\w0-9\.]+)*)"')
|
||||
def update_format_with_type_info(fmt, yt_fmt):
|
||||
# 'type' for invidious api format
|
||||
mime_type = multi_get(yt_fmt, 'mimeType', 'type')
|
||||
if mime_type is None:
|
||||
return
|
||||
match = re.fullmatch(fmt_type_re, mime_type)
|
||||
|
||||
type, fmt['ext'], codecs = match.groups()
|
||||
codecs = codecs.split(', ')
|
||||
for codec in codecs:
|
||||
update_format_with_codec_info(fmt, codec)
|
||||
if type == 'audio':
|
||||
assert len(codecs) == 1
|
||||
|
||||
def _extract_formats(info, player_response):
|
||||
streaming_data = player_response.get('streamingData', {})
|
||||
yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])
|
||||
|
||||
info['formats'] = []
|
||||
# because we may retry the extract_formats with a different player_response
|
||||
# so keep what we have
|
||||
conservative_update(info, 'hls_manifest_url',
|
||||
streaming_data.get('hlsManifestUrl'))
|
||||
conservative_update(info, 'dash_manifest_url',
|
||||
streaming_data.get('dash_manifest_url'))
|
||||
|
||||
for yt_fmt in yt_formats:
|
||||
itag = yt_fmt.get('itag')
|
||||
|
||||
fmt = {}
|
||||
fmt['itag'] = itag
|
||||
fmt['ext'] = None
|
||||
fmt['audio_bitrate'] = None
|
||||
fmt['acodec'] = None
|
||||
fmt['vcodec'] = None
|
||||
fmt['width'] = yt_fmt.get('width')
|
||||
fmt['height'] = yt_fmt.get('height')
|
||||
fmt['file_size'] = yt_fmt.get('contentLength')
|
||||
fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate')
|
||||
fmt['fps'] = yt_fmt.get('fps')
|
||||
update_format_with_type_info(fmt, yt_fmt)
|
||||
cipher = dict(urllib.parse.parse_qsl(multi_get(yt_fmt,
|
||||
'cipher', 'signatureCipher', default='')))
|
||||
if cipher:
|
||||
fmt['url'] = cipher.get('url')
|
||||
else:
|
||||
fmt['url'] = yt_fmt.get('url')
|
||||
fmt['s'] = cipher.get('s')
|
||||
fmt['sp'] = cipher.get('sp')
|
||||
|
||||
# update with information from big table
|
||||
hardcoded_itag_info = _formats.get(str(itag), {})
|
||||
for key, value in hardcoded_itag_info.items():
|
||||
conservative_update(fmt, key, value) # prefer info from Youtube
|
||||
fmt['quality'] = hardcoded_itag_info.get('height')
|
||||
|
||||
info['formats'].append(fmt)
|
||||
|
||||
# get ip address
|
||||
if info['formats']:
|
||||
query_string = (info['formats'][0].get('url') or '?').split('?')[1]
|
||||
info['ip_address'] = deep_get(
|
||||
urllib.parse.parse_qs(query_string), 'ip', 0)
|
||||
else:
|
||||
info['ip_address'] = None
|
||||
|
||||
hls_regex = re.compile(r'[\w_-]+=(?:"[^"]+"|[^",]+),')
|
||||
def extract_hls_formats(hls_manifest):
|
||||
'''returns hls_formats, err'''
|
||||
hls_formats = []
|
||||
try:
|
||||
lines = hls_manifest.splitlines()
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
if lines[i].startswith('#EXT-X-STREAM-INF'):
|
||||
fmt = {'acodec': None, 'vcodec': None, 'height': None,
|
||||
'width': None, 'fps': None, 'audio_bitrate': None,
|
||||
'itag': None, 'file_size': None,
|
||||
'audio_sample_rate': None, 'url': None}
|
||||
properties = lines[i].split(':')[1]
|
||||
properties += ',' # make regex work for last key-value pair
|
||||
|
||||
for pair in hls_regex.findall(properties):
|
||||
key, value = pair.rstrip(',').split('=')
|
||||
if key == 'CODECS':
|
||||
for codec in value.strip('"').split(','):
|
||||
update_format_with_codec_info(fmt, codec)
|
||||
elif key == 'RESOLUTION':
|
||||
fmt['width'], fmt['height'] = map(int, value.split('x'))
|
||||
fmt['resolution'] = value
|
||||
elif key == 'FRAME-RATE':
|
||||
fmt['fps'] = int(value)
|
||||
i += 1
|
||||
fmt['url'] = lines[i]
|
||||
assert fmt['url'].startswith('http')
|
||||
fmt['ext'] = 'm3u8'
|
||||
hls_formats.append(fmt)
|
||||
i += 1
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
return [], str(e)
|
||||
return hls_formats, None
|
||||
|
||||
|
||||
def _extract_playability_error(info, player_response, error_prefix=''):
|
||||
if info['formats']:
|
||||
info['playability_status'] = None
|
||||
info['playability_error'] = None
|
||||
return
|
||||
|
||||
playability_status = deep_get(player_response, 'playabilityStatus', 'status', default=None)
|
||||
info['playability_status'] = playability_status
|
||||
|
||||
playability_reason = extract_str(multi_deep_get(player_response,
|
||||
['playabilityStatus', 'reason'],
|
||||
['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'],
|
||||
default='Could not find playability error')
|
||||
)
|
||||
|
||||
if playability_status not in (None, 'OK'):
|
||||
info['playability_error'] = error_prefix + playability_reason
|
||||
elif not info['playability_error']: # do not override
|
||||
info['playability_error'] = error_prefix + 'Unknown playability error'
|
||||
|
||||
SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
|
||||
def extract_watch_info(polymer_json):
|
||||
info = {'playability_error': None, 'error': None}
|
||||
|
||||
if isinstance(polymer_json, dict):
|
||||
top_level = polymer_json
|
||||
elif isinstance(polymer_json, (list, tuple)):
|
||||
top_level = {}
|
||||
for page_part in polymer_json:
|
||||
if not isinstance(page_part, dict):
|
||||
return {'error': 'Invalid page part'}
|
||||
top_level.update(page_part)
|
||||
else:
|
||||
return {'error': 'Invalid top level polymer data'}
|
||||
|
||||
error = check_missing_keys(top_level,
|
||||
['player', 'args'],
|
||||
['player', 'assets', 'js'],
|
||||
['playerResponse'],
|
||||
)
|
||||
if error:
|
||||
info['playability_error'] = error
|
||||
|
||||
player_response = top_level.get('playerResponse', {})
|
||||
|
||||
# usually, only the embedded one has the urls
|
||||
player_args = deep_get(top_level, 'player', 'args', default={})
|
||||
if 'player_response' in player_args:
|
||||
embedded_player_response = json.loads(player_args['player_response'])
|
||||
else:
|
||||
embedded_player_response = {}
|
||||
|
||||
# captions
|
||||
info['automatic_caption_languages'] = []
|
||||
info['manual_caption_languages'] = []
|
||||
info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url
|
||||
info['translation_languages'] = []
|
||||
captions_info = player_response.get('captions', {})
|
||||
info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
|
||||
for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
|
||||
lang_code = caption_track.get('languageCode')
|
||||
if not lang_code:
|
||||
continue
|
||||
if caption_track.get('kind') == 'asr':
|
||||
info['automatic_caption_languages'].append(lang_code)
|
||||
else:
|
||||
info['manual_caption_languages'].append(lang_code)
|
||||
base_url = caption_track.get('baseUrl', '')
|
||||
lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0)
|
||||
if lang_name:
|
||||
info['_manual_caption_language_names'][lang_code] = lang_name
|
||||
|
||||
for translation_lang_info in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()):
|
||||
lang_code = translation_lang_info.get('languageCode')
|
||||
if lang_code:
|
||||
info['translation_languages'].append(lang_code)
|
||||
if translation_lang_info.get('isTranslatable') == False:
|
||||
print('WARNING: Found non-translatable caption language')
|
||||
|
||||
# formats
|
||||
_extract_formats(info, embedded_player_response)
|
||||
if not info['formats']:
|
||||
_extract_formats(info, player_response)
|
||||
|
||||
# playability errors
|
||||
_extract_playability_error(info, player_response)
|
||||
|
||||
# check age-restriction
|
||||
info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error'])
|
||||
|
||||
# base_js (for decryption of signatures)
|
||||
info['base_js'] = deep_get(top_level, 'player', 'assets', 'js')
|
||||
if info['base_js']:
|
||||
info['base_js'] = normalize_url(info['base_js'])
|
||||
# must uniquely identify url
|
||||
info['player_name'] = urllib.parse.urlparse(info['base_js']).path
|
||||
else:
|
||||
info['player_name'] = None
|
||||
|
||||
# extract stuff from visible parts of page
|
||||
mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={})
|
||||
if mobile:
|
||||
info.update(_extract_watch_info_mobile(top_level))
|
||||
else:
|
||||
info.update(_extract_watch_info_desktop(top_level))
|
||||
|
||||
# stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info
|
||||
vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={})
|
||||
liberal_update(info, 'title', extract_str(vd.get('title')))
|
||||
liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds')))
|
||||
liberal_update(info, 'view_count', extract_int(vd.get('viewCount')))
|
||||
# videos with no description have a blank string
|
||||
liberal_update(info, 'description', vd.get('shortDescription'))
|
||||
liberal_update(info, 'id', vd.get('videoId'))
|
||||
liberal_update(info, 'author', vd.get('author'))
|
||||
liberal_update(info, 'author_id', vd.get('channelId'))
|
||||
info['was_live'] = vd.get('isLiveContent')
|
||||
conservative_update(info, 'unlisted', not vd.get('isCrawlable', True)) #isCrawlable is false on limited state videos even if they aren't unlisted
|
||||
liberal_update(info, 'tags', vd.get('keywords', []))
|
||||
|
||||
# fallback stuff from microformat
|
||||
mf = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
|
||||
conservative_update(info, 'title', extract_str(mf.get('title')))
|
||||
conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds')))
|
||||
# this gives the view count for limited state videos
|
||||
conservative_update(info, 'view_count', extract_int(mf.get('viewCount')))
|
||||
conservative_update(info, 'description', extract_str(mf.get('description'), recover_urls=True))
|
||||
conservative_update(info, 'author', mf.get('ownerChannelName'))
|
||||
conservative_update(info, 'author_id', mf.get('externalChannelId'))
|
||||
conservative_update(info, 'live', deep_get(mf, 'liveBroadcastDetails',
|
||||
'isLiveNow'))
|
||||
liberal_update(info, 'unlisted', mf.get('isUnlisted'))
|
||||
liberal_update(info, 'category', mf.get('category'))
|
||||
liberal_update(info, 'time_published', mf.get('publishDate'))
|
||||
liberal_update(info, 'time_uploaded', mf.get('uploadDate'))
|
||||
|
||||
# other stuff
|
||||
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
|
||||
return info
|
||||
|
||||
def get_caption_url(info, language, format, automatic=False, translation_language=None):
|
||||
'''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.'''
|
||||
url = info['_captions_base_url']
|
||||
url += '&lang=' + language
|
||||
url += '&fmt=' + format
|
||||
if automatic:
|
||||
url += '&kind=asr'
|
||||
elif language in info['_manual_caption_language_names']:
|
||||
url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='')
|
||||
|
||||
if translation_language:
|
||||
url += '&tlang=' + translation_language
|
||||
return url
|
||||
|
||||
def update_with_age_restricted_info(info, video_info_page):
|
||||
ERROR_PREFIX = 'Error bypassing age-restriction: '
|
||||
|
||||
video_info = urllib.parse.parse_qs(video_info_page)
|
||||
player_response = deep_get(video_info, 'player_response', 0)
|
||||
if player_response is None:
|
||||
info['playability_error'] = ERROR_PREFIX + 'Could not find player_response in video_info_page'
|
||||
return
|
||||
try:
|
||||
player_response = json.loads(player_response)
|
||||
except json.decoder.JSONDecodeError:
|
||||
traceback.print_exc()
|
||||
info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response'
|
||||
return
|
||||
|
||||
_extract_formats(info, player_response)
|
||||
_extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)
|
||||
|
||||
def requires_decryption(info):
|
||||
return ('formats' in info) and info['formats'] and info['formats'][0]['s']
|
||||
|
||||
# adapted from youtube-dl and invidious:
|
||||
# https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr
|
||||
decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}{]+)return a\.join\(""\)\}')
|
||||
op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)')
|
||||
def extract_decryption_function(info, base_js):
|
||||
'''Insert decryption function into info. Return error string if not successful.
|
||||
Decryption function is a list of list[2] of numbers.
|
||||
It is advisable to cache the decryption function (uniquely identified by info['player_name']) so base.js (1 MB) doesn't need to be redownloaded each time'''
|
||||
info['decryption_function'] = None
|
||||
decrypt_function_match = decrypt_function_re.search(base_js)
|
||||
if decrypt_function_match is None:
|
||||
return 'Could not find decryption function in base.js'
|
||||
|
||||
function_body = decrypt_function_match.group(1).split(';')[1:-1]
|
||||
if not function_body:
|
||||
return 'Empty decryption function body'
|
||||
|
||||
var_name = get(function_body[0].split('.'), 0)
|
||||
if var_name is None:
|
||||
return 'Could not find var_name'
|
||||
|
||||
var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, flags=re.DOTALL)
|
||||
if var_body_match is None:
|
||||
return 'Could not find var_body'
|
||||
|
||||
operations = var_body_match.group(1).replace('\n', '').split('},')
|
||||
if not operations:
|
||||
return 'Did not find any definitions in var_body'
|
||||
operations[-1] = operations[-1][:-1] # remove the trailing '}' since we split by '},' on the others
|
||||
operation_definitions = {}
|
||||
for op in operations:
|
||||
colon_index = op.find(':')
|
||||
opening_brace_index = op.find('{')
|
||||
|
||||
if colon_index == -1 or opening_brace_index == -1:
|
||||
return 'Could not parse operation'
|
||||
op_name = op[:colon_index]
|
||||
op_body = op[opening_brace_index+1:]
|
||||
if op_body == 'a.reverse()':
|
||||
operation_definitions[op_name] = 0
|
||||
elif op_body == 'a.splice(0,b)':
|
||||
operation_definitions[op_name] = 1
|
||||
elif op_body.startswith('var c=a[0]'):
|
||||
operation_definitions[op_name] = 2
|
||||
else:
|
||||
return 'Unknown op_body: ' + op_body
|
||||
|
||||
decryption_function = []
|
||||
for op_with_arg in function_body:
|
||||
match = op_with_arg_re.fullmatch(op_with_arg)
|
||||
if match is None:
|
||||
return 'Could not parse operation with arg'
|
||||
op_name = match.group(1)
|
||||
if op_name not in operation_definitions:
|
||||
return 'Unknown op_name: ' + op_name
|
||||
op_argument = match.group(2)
|
||||
decryption_function.append([operation_definitions[op_name], int(op_argument)])
|
||||
|
||||
info['decryption_function'] = decryption_function
|
||||
return False
|
||||
|
||||
def _operation_2(a, b):
|
||||
c = a[0]
|
||||
a[0] = a[b % len(a)]
|
||||
a[b % len(a)] = c
|
||||
|
||||
def decrypt_signatures(info):
|
||||
'''Applies info['decryption_function'] to decrypt all the signatures. Return err.'''
|
||||
if not info.get('decryption_function'):
|
||||
return 'decryption_function not in info'
|
||||
for format in info['formats']:
|
||||
if not format['s'] or not format['sp'] or not format['url']:
|
||||
print('Warning: s, sp, or url not in format')
|
||||
continue
|
||||
|
||||
a = list(format['s'])
|
||||
for op, argument in info['decryption_function']:
|
||||
if op == 0:
|
||||
a.reverse()
|
||||
elif op == 1:
|
||||
a = a[argument:]
|
||||
else:
|
||||
_operation_2(a, argument)
|
||||
|
||||
signature = ''.join(a)
|
||||
format['url'] += '&' + format['sp'] + '=' + signature
|
||||
return False
|
@ -1,4 +1,4 @@
|
||||
from youtube_data import proto, utils
|
||||
from youtube_data import proto
|
||||
from flask import Markup as mk
|
||||
import requests
|
||||
import base64
|
||||
|
@ -1,130 +0,0 @@
|
||||
from youtube_data import proto
|
||||
import json
|
||||
import base64
|
||||
import urllib
|
||||
import requests
|
||||
import re
|
||||
import bleach
|
||||
from flask import Markup
|
||||
|
||||
URL_ORIGIN = "/https://www.youtube.com"
|
||||
|
||||
def make_comment_ctoken(video_id, sort=0, offset=0, lc='', secret_key=''):
|
||||
video_id = proto.as_bytes(video_id)
|
||||
secret_key = proto.as_bytes(secret_key)
|
||||
|
||||
|
||||
page_info = proto.string(4,video_id) + proto.uint(6, sort)
|
||||
offset_information = proto.nested(4, page_info) + proto.uint(5, offset)
|
||||
if secret_key:
|
||||
offset_information = proto.string(1, secret_key) + offset_information
|
||||
|
||||
page_params = proto.string(2, video_id)
|
||||
if lc:
|
||||
page_params += proto.string(6, proto.percent_b64encode(proto.string(15, lc)))
|
||||
|
||||
result = proto.nested(2, page_params) + proto.uint(3,6) + proto.nested(6, offset_information)
|
||||
return base64.urlsafe_b64encode(result).decode('ascii')
|
||||
|
||||
def comment_replies_ctoken(video_id, comment_id, max_results=500):
|
||||
|
||||
params = proto.string(2, comment_id) + proto.uint(9, max_results)
|
||||
params = proto.nested(3, params)
|
||||
|
||||
result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3,6) + proto.nested(6, params)
|
||||
return base64.urlsafe_b64encode(result).decode('ascii')
|
||||
|
||||
|
||||
|
||||
mobile_headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
|
||||
'Accept': '*/*',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'X-YouTube-Client-Name': '2',
|
||||
'X-YouTube-Client-Version': '2.20180823',
|
||||
}
|
||||
def request_comments(ctoken, replies=False):
|
||||
if replies: # let's make it use different urls for no reason despite all the data being encoded
|
||||
base_url = "https://m.youtube.com/watch_comment?action_get_comment_replies=1&ctoken="
|
||||
else:
|
||||
base_url = "https://m.youtube.com/watch_comment?action_get_comments=1&ctoken="
|
||||
url = base_url + ctoken.replace("=", "%3D") + "&pbj=1"
|
||||
|
||||
for i in range(0,8): # don't retry more than 8 times
|
||||
content = requests.get(url, headers=mobile_headers).text
|
||||
if content[0:4] == b")]}'": # random closing characters included at beginning of response for some reason
|
||||
content = content[4:]
|
||||
elif content[0:10] == b'\n<!DOCTYPE': # occasionally returns html instead of json for no reason
|
||||
content = b''
|
||||
print("got <!DOCTYPE>, retrying")
|
||||
continue
|
||||
break
|
||||
|
||||
polymer_json = json.loads(content)
|
||||
return polymer_json
|
||||
|
||||
|
||||
def single_comment_ctoken(video_id, comment_id):
|
||||
page_params = proto.string(2, video_id) + proto.string(6, proto.percent_b64encode(proto.string(15, comment_id)))
|
||||
|
||||
result = proto.nested(2, page_params) + proto.uint(3,6)
|
||||
return base64.urlsafe_b64encode(result).decode('ascii')
|
||||
|
||||
|
||||
def concat_texts(strings):
|
||||
'''Concatenates strings. Returns None if any of the arguments are None'''
|
||||
result = ''
|
||||
for string in strings:
|
||||
if string['text'] is None:
|
||||
return None
|
||||
result += string['text']
|
||||
return result
|
||||
|
||||
def parse_comment(raw_comment):
|
||||
cmnt = {}
|
||||
print(raw_comment)
|
||||
raw_comment = raw_comment['commentThreadRenderer']['comment']['commentRenderer']
|
||||
imgHostName = urllib.parse.urlparse(raw_comment['authorThumbnail']['thumbnails'][0]['url']).netloc
|
||||
cmnt['author'] = raw_comment['authorText']['runs'][0]['text']
|
||||
cmnt['thumbnail'] = raw_comment['authorThumbnail']['thumbnails'][0]['url'].replace("https://{}".format(imgHostName), "")+"?host="+imgHostName
|
||||
cmnt['channel'] = raw_comment['authorEndpoint']['commandMetadata']['webCommandMetadata']['url']
|
||||
cmnt['text'] = Markup(bleach.linkify(concat_texts(raw_comment['contentText']['runs']).replace("\n", "<br>")))
|
||||
cmnt['date'] = raw_comment['publishedTimeText']['runs'][0]['text']
|
||||
|
||||
try:
|
||||
cmnt['creatorHeart'] = raw_comment['creatorHeart']['creatorHeartRenderer']['creatorThumbnail']['thumbnails'][0]['url']
|
||||
except:
|
||||
cmnt['creatorHeart'] = False
|
||||
|
||||
try:
|
||||
cmnt['likes'] = raw_comment['likeCount']
|
||||
except:
|
||||
cmnt['likes'] = 0
|
||||
|
||||
try:
|
||||
cmnt['replies'] = raw_comment['replyCount']
|
||||
except:
|
||||
cmnt['replies'] = 0
|
||||
|
||||
cmnt['authorIsChannelOwner'] = raw_comment['authorIsChannelOwner']
|
||||
try:
|
||||
cmnt['pinned'] = raw_comment['pinnedCommentBadge']
|
||||
cmnt['pinned'] = True
|
||||
except:
|
||||
cmnt['pinned'] = False
|
||||
return cmnt
|
||||
|
||||
def post_process_comments_info(comments_info):
|
||||
comments = []
|
||||
for comment in comments_info[1]['response']['continuationContents']['commentSectionContinuation']['items']:
|
||||
comments.append(parse_comment(comment))
|
||||
return comments
|
||||
|
||||
|
||||
|
||||
def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
|
||||
comments_info = request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key))
|
||||
comments_info = post_process_comments_info(comments_info)
|
||||
return comments_info
|
||||
return {}
|
||||
|
@ -1,5 +1,5 @@
|
||||
from youtube_data import proto, utils
|
||||
from bs4 import BeautifulSoup as bs
|
||||
from youtube_data import proto
|
||||
from youtube import utils
|
||||
from flask import Markup
|
||||
import urllib.parse
|
||||
import requests
|
||||
|
@ -1,12 +0,0 @@
|
||||
def get_description_snippet_text(ds):
|
||||
string = ""
|
||||
for t in ds:
|
||||
try:
|
||||
if t['bold']:
|
||||
text = "<b>"+t['text']+"</b>"
|
||||
else:
|
||||
text = t['text']
|
||||
except:
|
||||
text = t['text']
|
||||
string = string + text
|
||||
return string
|
@ -1,281 +0,0 @@
|
||||
from bs4 import BeautifulSoup as bs
|
||||
from urllib.parse import unquote
|
||||
from youtube_dl import YoutubeDL
|
||||
import urllib.parse
|
||||
import requests
|
||||
import json
|
||||
|
||||
# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py
|
||||
_formats = {
|
||||
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
|
||||
'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
|
||||
'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
|
||||
'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'mp4v'},
|
||||
'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 96, 'vcodec': 'h264'},
|
||||
'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
||||
'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), audio_bitrate varies as well
|
||||
'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
|
||||
'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
||||
'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
||||
'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
|
||||
'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
|
||||
'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
||||
'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
||||
'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
|
||||
|
||||
# 3D videos
|
||||
'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
||||
'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
||||
'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
|
||||
'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
||||
'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
||||
|
||||
# Apple HTTP Live Streaming
|
||||
'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
|
||||
'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
|
||||
'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
||||
'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
|
||||
'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
|
||||
'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
|
||||
'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'h264'},
|
||||
|
||||
# DASH mp4 video
|
||||
'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
|
||||
'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
|
||||
'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
|
||||
'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
||||
|
||||
# Dash mp4 audio
|
||||
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 48, 'container': 'm4a_dash'},
|
||||
'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 128, 'container': 'm4a_dash'},
|
||||
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 256, 'container': 'm4a_dash'},
|
||||
'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
|
||||
'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
|
||||
'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
|
||||
'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
|
||||
|
||||
# Dash webm
|
||||
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
||||
'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
|
||||
'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
|
||||
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
||||
'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
||||
'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
||||
'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
||||
'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
||||
|
||||
# Dash webm audio
|
||||
'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 128},
|
||||
'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 256},
|
||||
|
||||
# Dash webm audio with opus inside
|
||||
'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 50},
|
||||
'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 70},
|
||||
'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 160},
|
||||
|
||||
# RTMP (unnamed)
|
||||
'_rtmp': {'protocol': 'rtmp'},
|
||||
|
||||
# av01 video only formats sometimes served with "unknown" codecs
|
||||
'394': {'vcodec': 'av01.0.05M.08'},
|
||||
'395': {'vcodec': 'av01.0.05M.08'},
|
||||
'396': {'vcodec': 'av01.0.05M.08'},
|
||||
'397': {'vcodec': 'av01.0.05M.08'},
|
||||
}
|
||||
|
||||
|
||||
def get_renderer_key(renderer, key):
|
||||
for k in renderer:
|
||||
if key in k:
|
||||
return k[key]
|
||||
|
||||
def get_video_primary_info(datad, datai):
|
||||
contents = datai["contents"]["twoColumnWatchNextResults"]['results']['results']['contents']
|
||||
item = get_renderer_key(contents, "videoPrimaryInfoRenderer")
|
||||
details = datad['videoDetails']
|
||||
|
||||
# Check if is Livestream
|
||||
if details.get('isLive') and details['lengthSeconds'] == '0':
|
||||
isLive = True
|
||||
else:
|
||||
isLive = False
|
||||
|
||||
# Check if is a Scheduled video
|
||||
if details.get('isUpcoming') == True:
|
||||
isUpcoming = True
|
||||
views = "Scheduled video"
|
||||
premieres = item['dateText']['simpleText']
|
||||
audioURL = False
|
||||
else:
|
||||
isUpcoming = False
|
||||
premieres = False
|
||||
views = details['viewCount']
|
||||
|
||||
ydl = YoutubeDL()
|
||||
|
||||
if isUpcoming == False:
|
||||
data = ydl.extract_info(details['videoId'], False)
|
||||
while not data['formats']:
|
||||
data = ydl.extract_info(details['videoId'], False)
|
||||
formats = data['formats']
|
||||
|
||||
## Get audio
|
||||
audio_urls = []
|
||||
for f in data['formats']:
|
||||
for fid in _formats:
|
||||
if f['format_id'] == fid:
|
||||
try:
|
||||
if 'audio' in _formats[fid]['format_note']:
|
||||
aurl = f['url']
|
||||
fnote = _formats[fid]['format_note']
|
||||
bitrate = _formats[fid]['audio_bitrate']
|
||||
audio_inf = {
|
||||
"url":aurl,
|
||||
"id":fnote,
|
||||
"btr": bitrate
|
||||
}
|
||||
audio_urls.append(audio_inf)
|
||||
except:
|
||||
continue
|
||||
if not isLive:
|
||||
audioURL = audio_urls[-1]['url']
|
||||
else:
|
||||
audioURL = False
|
||||
else: # If it is a scheduled video
|
||||
audio_urls = False
|
||||
formats = False
|
||||
try:
|
||||
primaryInfo = {
|
||||
"id": details['videoId'],
|
||||
"title": details['title'],
|
||||
"description": details['shortDescription'],
|
||||
"views": views,
|
||||
"duration": details['lengthSeconds'],
|
||||
"date": item['dateText']['simpleText'],
|
||||
"rating": details['averageRating'],
|
||||
"author": details['author'],
|
||||
"isPrivate": details['isPrivate'],
|
||||
"isLive": isLive,
|
||||
"isUpcoming": isUpcoming,
|
||||
"url":url,
|
||||
"allowRatings": details['allowRatings'],
|
||||
"urls":formats,
|
||||
"thumbnail": details['thumbnail']['thumbnails'][0]['url'],
|
||||
"audio": audioURL,
|
||||
"premieres": premieres
|
||||
}
|
||||
except:
|
||||
# If error take only most common items
|
||||
primaryInfo = {
|
||||
"id": details['videoId'],
|
||||
"title": details['title'],
|
||||
"description": details['shortDescription'],
|
||||
"views": details['viewCount'],
|
||||
"duration": details['lengthSeconds'],
|
||||
"date": item['dateText']['simpleText'],
|
||||
"rating": details['averageRating'],
|
||||
"author": details['author'],
|
||||
"isPrivate":False,
|
||||
"isLive":isLive,
|
||||
"isUpcoming":isUpcoming,
|
||||
"allowRatings":True,
|
||||
"urls":formats,
|
||||
"thumbnail": details['thumbnail']['thumbnails'][0]['url'],
|
||||
"audio": audioURL,
|
||||
"premieres": premieres
|
||||
}
|
||||
return primaryInfo
|
||||
|
||||
def get_video_owner_info(data):
|
||||
contents = data["contents"]["twoColumnWatchNextResults"]['results']['results']['contents']
|
||||
item = get_renderer_key(contents, "videoSecondaryInfoRenderer")
|
||||
ownerItem = item['owner']['videoOwnerRenderer']
|
||||
|
||||
try:
|
||||
sC = ownerItem['subscriberCountText']['runs'][0]['text']
|
||||
except:
|
||||
sC = "Unknown"
|
||||
ownerInfo = {
|
||||
"thumbnail": ownerItem['thumbnail']['thumbnails'][0]['url'],
|
||||
"username": ownerItem['title']['runs'][0]['text'],
|
||||
"id": ownerItem['title']['runs'][0]['navigationEndpoint']['browseEndpoint']['browseId'],
|
||||
"suscriberCount":sC
|
||||
}
|
||||
return ownerInfo
|
||||
|
||||
def get_video_info(id):
|
||||
headers = {"Accept-Language": "en-US,en;q=0.5"}
|
||||
encoded_search = urllib.parse.quote(id)
|
||||
BASE_URL = "https://youtube.com"
|
||||
|
||||
url = f"{BASE_URL}/watch?v={encoded_search}"
|
||||
response = requests.get(url, headers=headers).text
|
||||
|
||||
while 'window["ytInitialData"]' and 'window["ytInitialData"]' not in response:
|
||||
response = requests.get(url, headers=headers).text
|
||||
|
||||
start = (
|
||||
response.index('window["ytInitialData"]')
|
||||
+ len('window["ytInitialData"]')
|
||||
+ 3
|
||||
)
|
||||
|
||||
start2 = (
|
||||
response.index('window["ytInitialPlayerResponse"]')
|
||||
+ len('window["ytInitialPlayerResponse"]') + 3
|
||||
)
|
||||
|
||||
end1 = response.index("};", start) + 1
|
||||
end2 = response.index("};", start2) + 1
|
||||
jsonIni = response[start:end1]
|
||||
dataInitial = json.loads(jsonIni)
|
||||
|
||||
try:
|
||||
jsonDet = response[start2:end2]
|
||||
dataDetails = json.loads(jsonDet)
|
||||
except:
|
||||
response = requests.get(url, headers=headers).json()
|
||||
jsonDet = response[start2:end2]
|
||||
dataDetails = json.loads(jsonDet)
|
||||
|
||||
|
||||
#title, views, date
|
||||
videoInfo = get_video_primary_info(dataDetails, dataInitial)
|
||||
ownerInfo = get_video_owner_info(dataInitial)
|
||||
|
||||
'''soup = bs(response, "html.parser")
|
||||
soup = str(str(soup.find("div", attrs={"id":"player-wrap"}).find_all("script")).split("ytplayer.config =")[1]).split("url")
|
||||
for url in soup:
|
||||
if "googlevideo" in url:
|
||||
print(unquote(url.replace("\\", "")))'''
|
||||
info = {"video":videoInfo, "owner":ownerInfo}
|
||||
return info
|
Reference in New Issue
Block a user