Merge branch 'dev-indep' of https://github.com/ytorg/yotter into dev-indep

commit 7a71b6914a

app/routes.py (575 lines changed; file diff suppressed because it is too large)
@@ -1,7 +1,7 @@
 <div class="comment">
 <a class="avatar" style="width: 32px; height: 32px;"><img src="{{ comment.thumbnail }}"></a>
 <div class="content">
-{% if comment.authorIsChannelOwner %}
+{% if comment.author == info.author %}
 
 <a class="author" style="color: red;" href="{{comment.channel}}"><i class="red user circle icon"></i>{{comment.author}}</a>
 {% else %}
@@ -22,9 +22,6 @@
 <i class="thumbs up icon"></i>
 {{comment.likes}}
 </div>
-{%if comment.creatorHeart != false%}
-<i class="small red heart icon"></i><img class="ui circular image" style="width: 15px; height: 15px;" src="{{comment.creatorHeart}}">
-{% endif %}
 </div>
 </div>
 </div>
@@ -4,30 +4,30 @@
 {% extends "base.html" %}
 {% block content %}
 <div class="ui text container">
-{% if video.nginxUrl == "#" %}
+{% if info.error != None or info.playability_error != None %}
 <div class="ui center aligned text container">
 <div class="ui segment">
 <h4 class="ui header">ERROR WITH VIDEO</h4>
 </div>
 </div>
-{% elif video.isUpcoming %}
+{% elif info.playability_status != None %}
 <div class="ui center aligned text container">
 <div class="ui segment">
 <h4 class="ui header">SCHEDULED VIDEO</h4>
 <h5 class="ui header">{{video.premieres}}</h5>
 </div>
 </div>
-{% elif video.isLive %}
+{% elif info.live %}
 <div class="video-js-responsive-container vjs-hd">
-<video-js id=live width="1080" class="video-js vjs-default-skin" controls buffered>
+<video-js id=live width="1080" class="video-js vjs-default-skin" controls>
 <source
-src="{{urls[0]['url']}}"
+src="#"
 type="application/x-mpegURL">
 </video-js>
 </div>
 <div class="ui center aligned text container">
 <div class="ui segment">
-<h3 class="ui header">LIVESTREAM VIDEO</h3>
+<h3 class="ui header"><i class="red small circle icon"></i> LIVESTREAM VIDEO</h3>
 <h4 class="ui header">FEATURE AVAILABLE SOON</h4>
 <h5 class="ui header">Livestreams are under developent and still not supported on Yotter.</h5>
 </div>
@@ -41,11 +41,11 @@
 buffered
 preload="none">
 {% if config.nginxVideoStream %}
-{% for url in urls %}
-<source src="{{url.url}}" type="video/{{url.ext}}">
+{% for format in info.formats %}
+{% if format.video_valid %}
+<source src="{{format.url}}" type="video/{{format.ext}}">
+{% endif %}
 {% endfor %}
-{% else %}
-<source src="{{url_for('stream', url=video.videoUrl.replace('/', 'YotterSlash'))}}" type="video/mp4">
 {% endif %}
 </video>
 </div>
@@ -53,55 +53,54 @@
 
 <div class="ui segments">
 <div class="ui segment">
-<h2 class="ui header break-word">{{video.title}}</h2>
+<h2 class="ui header break-word">{{info.title}}</h2>
 </div>
 <div class="ui horizontal segments">
 <div class="center aligned ui segment">
-<a href="{{ url_for('channel', id=video.channelId)}}">
-{%if video.author.__len__() > 8%}
-<i class="user icon"></i> {{video.author[0:10]+'...'}}
-{%else%}
-<i class="user icon"></i> {{video.author}}
-{%endif%}
+<a href="{{ url_for('channel', id=info.author_id)}}">
+<i class="user icon"></i> {{info.author}}
 </a>
 </div>
 <div class="center aligned ui segment">
-<h4 class="ui header"><i class="grey eye icon"></i>{{video.viewCount}}</h4>
+<h4 class="ui header"><i class="grey eye icon"></i>{{info.view_count}}</h4>
 </div>
 <div class="center aligned ui segment">
-{% if video.averageRating | int > 49 %}
-<h4 class="ui header"><i class="green thumbs up icon"></i> {{video.averageRating[0:4]}}%</h4>
+{% if info.rating | int > 49 %}
+<h4 class="ui header"><i class="green thumbs up icon"></i> {{info.rating}}%</h4>
 {% else %}
-<h4 class="ui header"><i class="red thumbs down icon"></i> {{video.averageRating[0:4]}}%</h4>
+<h4 class="ui header"><i class="red thumbs down icon"></i> {{info.rating}}%</h4>
 {% endif %}
 </div>
 </div>
 
 <div class="ui raised center aligned segment break-word">
-<p><i class="grey music icon"></i><b><a href="{{video.nginxAudioUrl}}">Play Only Audio</a></b></p>
+<p><i class="grey music icon"></i><b>Audio Only</b></p>
 <audio controls>
-<source src="{{video.nginxAudioUrl}}">
-Your browser does not support the audio element.
+{% for format in info.formats %}
+{% if format.audio_valid %}
+<source src="{{format.url}}">
+{%endif%}
+{%endfor%}
+No audio available.
 </audio>
 </div>
 
 <div class="ui raised segment break-word">
-<p>{{video.description}}</p>
+<p>{{info.description}}</p>
 </div>
 </div>
 
-{% if comments != False %}
 <div class="ui comments">
 <h3 class="ui dividing header">Comments</h3>
-{% for comment in video.comments %}
+{% for comment in videocomments %}
 {% include '_video_comment.html' %}
 {% endfor %}
 </div>
-{%endif%}
 
 <script src="{{ url_for('static',filename='video.min.js') }}"></script>
-<script src="{{ url_for('static',filename='videojs-http-streaming.min.js')}}"></script>
-{% if video.isLive %}
+{% if info.live %}
+<p>Active</p>
+<script src="{{ url_for('static',filename='videojs-http-streaming.min.js')}}"></script>
 <script>
 var player = videojs('live');
 player.play();
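The reworked watch template above filters info.formats on format.video_valid and format.audio_valid. Those flags are set by the route code (in the suppressed app/routes.py diff), so they are not visible here. Purely as an illustrative sketch, and assuming yt-dlp-style format dictionaries with 'vcodec'/'acodec'/'url' keys (an assumption, not something this commit shows), the flags could be derived roughly like this:

# Hypothetical helper (not part of this commit): tag each format dict so the
# template can pick playable video and audio sources.
def label_formats(formats):
    for fmt in formats:
        has_url = bool(fmt.get('url'))
        fmt['video_valid'] = has_url and fmt.get('vcodec', 'none') != 'none'
        fmt['audio_valid'] = has_url and fmt.get('acodec', 'none') != 'none'
    return formats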
youtube/channel.py (new file, 281 lines)
@@ -0,0 +1,281 @@
import base64
from youtube import util, yt_data_extract, local_playlist, subscriptions
from youtube import yt_app

import urllib
import json
from string import Template
import youtube.proto as proto
import html
import math
import gevent
import re
import cachetools.func
import traceback

import flask
from flask import request

headers_desktop = (
    ('Accept', '*/*'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('X-YouTube-Client-Name', '1'),
    ('X-YouTube-Client-Version', '2.20180830'),
) + util.desktop_ua
headers_mobile = (
    ('Accept', '*/*'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('X-YouTube-Client-Name', '2'),
    ('X-YouTube-Client-Version', '2.20180830'),
) + util.mobile_ua
real_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=8XihrAcN1l4'),)
generic_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=ST1Ti53r4fU'),)

# SORT:
# videos:
#    Popular - 1
#    Oldest - 2
#    Newest - 3
# playlists:
#    Oldest - 2
#    Newest - 3
#    Last video added - 4

# view:
# grid: 0 or 1
# list: 2
def channel_ctoken_v3(channel_id, page, sort, tab, view=1):
    # page > 1 doesn't work when sorting by oldest
    offset = 30*(int(page) - 1)
    page_token = proto.string(61, proto.unpadded_b64encode(
        proto.string(1, proto.unpadded_b64encode(proto.uint(1,offset)))
    ))

    tab = proto.string(2, tab )
    sort = proto.uint(3, int(sort))

    shelf_view = proto.uint(4, 0)
    view = proto.uint(6, int(view))
    continuation_info = proto.string(3,
        proto.percent_b64encode(tab + sort + shelf_view + view + page_token)
    )

    channel_id = proto.string(2, channel_id )
    pointless_nest = proto.string(80226972, channel_id + continuation_info)

    return base64.urlsafe_b64encode(pointless_nest).decode('ascii')

def channel_ctoken_v2(channel_id, page, sort, tab, view=1):
    # see https://github.com/iv-org/invidious/issues/1319#issuecomment-671732646
    # page > 1 doesn't work when sorting by oldest
    offset = 30*(int(page) - 1)
    schema_number = {
        3: 6307666885028338688,
        2: 17254859483345278706,
        1: 16570086088270825023,
    }[int(sort)]
    page_token = proto.string(61, proto.unpadded_b64encode(proto.string(1,
        proto.uint(1, schema_number) + proto.string(2,
            proto.string(1, proto.unpadded_b64encode(proto.uint(1,offset)))
        )
    )))

    tab = proto.string(2, tab )
    sort = proto.uint(3, int(sort))
    #page = proto.string(15, str(page) )

    shelf_view = proto.uint(4, 0)
    view = proto.uint(6, int(view))
    continuation_info = proto.string(3,
        proto.percent_b64encode(tab + sort + shelf_view + view + page_token)
    )

    channel_id = proto.string(2, channel_id )
    pointless_nest = proto.string(80226972, channel_id + continuation_info)

    return base64.urlsafe_b64encode(pointless_nest).decode('ascii')

def channel_ctoken_v1(channel_id, page, sort, tab, view=1):
    tab = proto.string(2, tab )
    sort = proto.uint(3, int(sort))
    page = proto.string(15, str(page) )
    # example with shelves in videos tab: https://www.youtube.com/channel/UCNL1ZadSjHpjm4q9j2sVtOA/videos
    shelf_view = proto.uint(4, 0)
    view = proto.uint(6, int(view))
    continuation_info = proto.string(3, proto.percent_b64encode(tab + view + sort + shelf_view + page + proto.uint(23, 0)) )

    channel_id = proto.string(2, channel_id )
    pointless_nest = proto.string(80226972, channel_id + continuation_info)

    return base64.urlsafe_b64encode(pointless_nest).decode('ascii')

def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1, print_status=True):
    message = 'Got channel tab' if print_status else None

    if int(sort) == 2 and int(page) > 1:
        ctoken = channel_ctoken_v1(channel_id, page, sort, tab, view)
        ctoken = ctoken.replace('=', '%3D')
        url = ('https://www.youtube.com/channel/' + channel_id + '/' + tab
            + '?action_continuation=1&continuation=' + ctoken
            + '&pbj=1')
        content = util.fetch_url(url, headers_desktop + real_cookie,
            debug_name='channel_tab', report_text=message)
    else:
        ctoken = channel_ctoken_v3(channel_id, page, sort, tab, view)
        ctoken = ctoken.replace('=', '%3D')
        url = 'https://www.youtube.com/browse_ajax?ctoken=' + ctoken
        content = util.fetch_url(url,
            headers_desktop + generic_cookie,
            debug_name='channel_tab', report_text=message)

    return content

# cache entries expire after 30 minutes
@cachetools.func.ttl_cache(maxsize=128, ttl=30*60)
def get_number_of_videos_channel(channel_id):
    if channel_id is None:
        return 1000

    # Uploads playlist
    playlist_id = 'UU' + channel_id[2:]
    url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&pbj=1'

    try:
        response = util.fetch_url(url, headers_mobile,
            debug_name='number_of_videos', report_text='Got number of videos')
    except urllib.error.HTTPError as e:
        traceback.print_exc()
        print("Couldn't retrieve number of videos")
        return 1000

    response = response.decode('utf-8')

    # match = re.search(r'"numVideosText":\s*{\s*"runs":\s*\[{"text":\s*"([\d,]*) videos"', response)
    match = re.search(r'"numVideosText".*?([,\d]+)', response)
    if match:
        return int(match.group(1).replace(',',''))
    else:
        return 0

channel_id_re = re.compile(r'videos\.xml\?channel_id=([a-zA-Z0-9_-]{24})"')
@cachetools.func.lru_cache(maxsize=128)
def get_channel_id(base_url):
    # method that gives the smallest possible response at ~4 kb
    # needs to be as fast as possible
    base_url = base_url.replace('https://www', 'https://m') # avoid redirect
    response = util.fetch_url(base_url + '/about?pbj=1', headers_mobile,
        debug_name='get_channel_id', report_text='Got channel id').decode('utf-8')
    match = channel_id_re.search(response)
    if match:
        return match.group(1)
    return None

def get_number_of_videos_general(base_url):
    return get_number_of_videos_channel(get_channel_id(base_url))

def get_channel_search_json(channel_id, query, page):
    params = proto.string(2, 'search') + proto.string(15, str(page))
    params = proto.percent_b64encode(params)
    ctoken = proto.string(2, channel_id) + proto.string(3, params) + proto.string(11, query)
    ctoken = base64.urlsafe_b64encode(proto.nested(80226972, ctoken)).decode('ascii')

    polymer_json = util.fetch_url("https://www.youtube.com/browse_ajax?ctoken=" + ctoken, headers_desktop, debug_name='channel_search')

    return polymer_json


def post_process_channel_info(info):
    info['avatar'] = util.prefix_url(info['avatar'])
    info['channel_url'] = util.prefix_url(info['channel_url'])
    for item in info['items']:
        util.prefix_urls(item)
        util.add_extra_html_info(item)


playlist_sort_codes = {'2': "da", '3': "dd", '4': "lad"}

# youtube.com/[channel_id]/[tab]
# youtube.com/user/[username]/[tab]
# youtube.com/c/[custom]/[tab]
# youtube.com/[custom]/[tab]
def get_channel_page_general_url(base_url, tab, request, channel_id=None):

    page_number = int(request.args.get('page', 1))
    sort = request.args.get('sort', '3')
    view = request.args.get('view', '1')
    query = request.args.get('query', '')

    if tab == 'videos' and channel_id:
        tasks = (
            gevent.spawn(get_number_of_videos_channel, channel_id),
            gevent.spawn(get_channel_tab, channel_id, page_number, sort, 'videos', view)
        )
        gevent.joinall(tasks)
        util.check_gevent_exceptions(*tasks)
        number_of_videos, polymer_json = tasks[0].value, tasks[1].value
    elif tab == 'videos':
        tasks = (
            gevent.spawn(get_number_of_videos_general, base_url),
            gevent.spawn(util.fetch_url, base_url + '/videos?pbj=1&view=0', headers_desktop, debug_name='gen_channel_videos')
        )
        gevent.joinall(tasks)
        util.check_gevent_exceptions(*tasks)
        number_of_videos, polymer_json = tasks[0].value, tasks[1].value
    elif tab == 'about':
        polymer_json = util.fetch_url(base_url + '/about?pbj=1', headers_desktop, debug_name='gen_channel_about')
    elif tab == 'playlists':
        polymer_json = util.fetch_url(base_url+ '/playlists?pbj=1&view=1&sort=' + playlist_sort_codes[sort], headers_desktop, debug_name='gen_channel_playlists')
    elif tab == 'search' and channel_id:
        polymer_json = get_channel_search_json(channel_id, query, page_number)
    elif tab == 'search':
        url = base_url + '/search?pbj=1&query=' + urllib.parse.quote(query, safe='')
        polymer_json = util.fetch_url(url, headers_desktop, debug_name='gen_channel_search')
    else:
        flask.abort(404, 'Unknown channel tab: ' + tab)


    info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab)
    if info['error'] is not None:
        return flask.render_template('error.html', error_message = info['error'])

    post_process_channel_info(info)
    if tab == 'videos':
        info['number_of_videos'] = number_of_videos
        info['number_of_pages'] = math.ceil(number_of_videos/30)
        info['header_playlist_names'] = local_playlist.get_playlist_names()
    if tab in ('videos', 'playlists'):
        info['current_sort'] = sort
    elif tab == 'search':
        info['search_box_value'] = query
        info['header_playlist_names'] = local_playlist.get_playlist_names()
        info['page_number'] = page_number
    info['subscribed'] = subscriptions.is_subscribed(info['channel_id'])

    return flask.render_template('channel.html',
        parameters_dictionary = request.args,
        **info
    )

@yt_app.route('/channel/<channel_id>/')
@yt_app.route('/channel/<channel_id>/<tab>')
def get_channel_page(channel_id, tab='videos'):
    return get_channel_page_general_url('https://www.youtube.com/channel/' + channel_id, tab, request, channel_id)

@yt_app.route('/user/<username>/')
@yt_app.route('/user/<username>/<tab>')
def get_user_page(username, tab='videos'):
    return get_channel_page_general_url('https://www.youtube.com/user/' + username, tab, request)

@yt_app.route('/c/<custom>/')
@yt_app.route('/c/<custom>/<tab>')
def get_custom_c_page(custom, tab='videos'):
    return get_channel_page_general_url('https://www.youtube.com/c/' + custom, tab, request)

@yt_app.route('/<custom>')
@yt_app.route('/<custom>/<tab>')
def get_toplevel_custom_page(custom, tab='videos'):
    return get_channel_page_general_url('https://www.youtube.com/' + custom, tab, request)
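For context, a minimal sketch of how the helpers above fit together: channel_ctoken_v3 builds the continuation token that get_channel_tab sends to the browse_ajax endpoint. The channel id below is a placeholder, not a real channel.

# Illustrative only; 'UCxxxxxxxxxxxxxxxxxxxxxx' is a placeholder channel id.
ctoken = channel_ctoken_v3('UCxxxxxxxxxxxxxxxxxxxxxx', page=2, sort=3, tab='videos')
# get_channel_tab('UCxxxxxxxxxxxxxxxxxxxxxx', page='2') builds the same token and
# fetches https://www.youtube.com/browse_ajax?ctoken=... with it, returning raw JSON.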
youtube/comments.py (new file, 145 lines)
@@ -0,0 +1,145 @@
import base64
import json

from youtube import proto, util, yt_data_extract
from youtube.util import concat_or_none


# Here's what I know about the secret key (starting with ASJN_i)
# *The secret key definitely contains the following information (or perhaps the information is stored at youtube's servers):
#   -Video id
#   -Offset
#   -Sort
# *If the video id or sort in the ctoken contradicts the ASJN, the response is an error. The offset encoded outside the ASJN is ignored entirely.
# *The ASJN is base64 encoded data, indicated by the fact that the character after "ASJN_i" is one of ("0", "1", "2", "3")
# *The encoded data is not valid protobuf
# *The encoded data (after the 5 or so bytes that are always the same) is indistinguishable from random data according to a battery of randomness tests
# *The ASJN in the ctoken provided by a response changes in regular intervals of about a second or two.
# *Old ASJN's continue to work, and start at the same comment even if new comments have been posted since
# *The ASJN has no relation with any of the data in the response it came from

def make_comment_ctoken(video_id, sort=0, offset=0, lc='', secret_key=''):
    video_id = proto.as_bytes(video_id)
    secret_key = proto.as_bytes(secret_key)


    page_info = proto.string(4,video_id) + proto.uint(6, sort)
    offset_information = proto.nested(4, page_info) + proto.uint(5, offset)
    if secret_key:
        offset_information = proto.string(1, secret_key) + offset_information

    page_params = proto.string(2, video_id)
    if lc:
        page_params += proto.string(6, proto.percent_b64encode(proto.string(15, lc)))

    result = proto.nested(2, page_params) + proto.uint(3,6) + proto.nested(6, offset_information)
    return base64.urlsafe_b64encode(result).decode('ascii')

def comment_replies_ctoken(video_id, comment_id, max_results=500):

    params = proto.string(2, comment_id) + proto.uint(9, max_results)
    params = proto.nested(3, params)

    result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3,6) + proto.nested(6, params)
    return base64.urlsafe_b64encode(result).decode('ascii')


mobile_headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'X-YouTube-Client-Name': '2',
    'X-YouTube-Client-Version': '2.20180823',
}
def request_comments(ctoken, replies=False):
    if replies: # let's make it use different urls for no reason despite all the data being encoded
        base_url = "https://m.youtube.com/watch_comment?action_get_comment_replies=1&ctoken="
    else:
        base_url = "https://m.youtube.com/watch_comment?action_get_comments=1&ctoken="
    url = base_url + ctoken.replace("=", "%3D") + "&pbj=1"

    for i in range(0,8):    # don't retry more than 8 times
        content = util.fetch_url(url, headers=mobile_headers, report_text="Retrieved comments", debug_name='request_comments')
        if content[0:4] == b")]}'":     # random closing characters included at beginning of response for some reason
            content = content[4:]
        elif content[0:10] == b'\n<!DOCTYPE':   # occasionally returns html instead of json for no reason
            content = b''
            print("got <!DOCTYPE>, retrying")
            continue
        break

    polymer_json = json.loads(util.uppercase_escape(content.decode('utf-8')))
    return polymer_json


def single_comment_ctoken(video_id, comment_id):
    page_params = proto.string(2, video_id) + proto.string(6, proto.percent_b64encode(proto.string(15, comment_id)))

    result = proto.nested(2, page_params) + proto.uint(3,6)
    return base64.urlsafe_b64encode(result).decode('ascii')


def post_process_comments_info(comments_info):
    for comment in comments_info['comments']:
        comment['author_url'] = concat_or_none(
            util.URL_ORIGIN, comment['author_url'])
        comment['author_avatar'] = concat_or_none(
            '/', comment['author_avatar'])

        comment['permalink'] = concat_or_none(util.URL_ORIGIN, '/watch?v=',
            comments_info['video_id'], '&lc=', comment['id'])

        reply_count = comment['reply_count']
        if reply_count == 0:
            comment['replies_url'] = concat_or_none(util.URL_ORIGIN,
                '/post_comment?parent_id=', comment['id'],
                '&video_id=', comments_info['video_id'])
        else:
            comment['replies_url'] = concat_or_none(util.URL_ORIGIN,
                '/comments?parent_id=', comment['id'],
                '&video_id=', comments_info['video_id'])

        if reply_count == 0:
            comment['view_replies_text'] = 'Reply'
        elif reply_count == 1:
            comment['view_replies_text'] = '1 reply'
        else:
            comment['view_replies_text'] = str(reply_count) + ' replies'


        if comment['like_count'] == 1:
            comment['likes_text'] = '1 like'
        else:
            comment['likes_text'] = str(comment['like_count']) + ' likes'


    if comments_info['ctoken']:
        comments_info['more_comments_url'] = concat_or_none(util.URL_ORIGIN,
            '/comments?ctoken=', comments_info['ctoken'])

    comments_info['page_number'] = page_number = str(int(comments_info['offset']/20) + 1)

    if not comments_info['is_replies']:
        comments_info['sort_text'] = 'top' if comments_info['sort'] == 0 else 'newest'


    comments_info['video_url'] = concat_or_none(util.URL_ORIGIN,
        '/watch?v=', comments_info['video_id'])
    comments_info['video_thumbnail'] = concat_or_none('/i.ytimg.com/vi/',
        comments_info['video_id'], '/mqdefault.jpg')


def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
    comments_info = yt_data_extract.extract_comments_info(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key)))
    post_process_comments_info(comments_info)

    post_comment_url = util.URL_ORIGIN + "/post_comment?video_id=" + video_id
    other_sort_url = util.URL_ORIGIN + '/comments?ctoken=' + make_comment_ctoken(video_id, sort=1 - sort, lc=lc)
    other_sort_text = 'Sort by ' + ('newest' if sort == 0 else 'top')
    comments_info['comment_links'] = [('Post comment', post_comment_url), (other_sort_text, other_sort_url)]

    return comments_info

    return {}
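A short sketch of how the ctoken helpers above are typically combined: make_comment_ctoken produces the token, request_comments fetches the polymer JSON, and video_comments wraps both plus post-processing. The video id below is only a placeholder.

# Illustrative only; 'dQw4w9WgXcQ' stands in for a real video id.
token = make_comment_ctoken('dQw4w9WgXcQ', sort=0, offset=0)
polymer_json = request_comments(token)          # raw comment JSON from m.youtube.com
# or, end to end:
# info = video_comments('dQw4w9WgXcQ', sort=0)  # extracted and post-processed dict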
youtube/opensearch.xml (new file, 11 lines)
@@ -0,0 +1,11 @@
<SearchPlugin xmlns="http://www.mozilla.org/2006/browser/search/">
<ShortName>Youtube local</ShortName>
<Description>no CIA shit in the background</Description>
<InputEncoding>UTF-8</InputEncoding>
<Image width="16" height="16">data:image/x-icon;base64,AAABAAEAEBAAAAEACAAlAgAAFgAAAIlQTkcNChoKAAAADUlIRFIAAAAQAAAAEAgGAAAAH/P/YQAAAexJREFUOI2lkzFPmlEUhp/73fshtCUCRtvQkJoKMrDQJvoHnBzUhc3EH0DUQf+As6tujo4M6mTiIDp0kGiMTRojTRNSW6o12iD4YYXv3g7Qr4O0ScM7npz7vOe+J0fk83lDF7K6eQygwkdHhI+P0bYNxmBXq5RmZui5vGQgn0f7fKi7O4oLC1gPD48BP9JpnpRKJFZXcQMB3m1u4vr9NHp76d/bo39/n4/z84ROThBa4/r91OJxMKb9BSn5mskAIOt1eq6uEFpjVyrEcjk+T0+TXlzkbTZLuFDAur9/nIFRipuREQCe7+zgBgK8mZvj/fIylVTKa/6UzXKbSnnuHkA0GnwbH/cA0a0takND3IyOEiwWAXBiMYTWjzLwtvB9bAyAwMUF8ZUVPiwtYTWbHqA6PIxoNv8OMLbN3eBga9TZWYQxaKX+AJJJhOv+AyAlT0slAG6TSX5n8+zszJugkzxA4PzcK9YSCQCk42DXaq1aGwqgfT5ebG9jpMQyUjKwu8vrtbWWqxC83NjAd31NsO2uleJnX58HCJ6eEjk8BGNQAA+RCOXJScpTU2AMwnUxlkXk4ACA+2iUSKGArNeRjkMsl6M8MYHQGtHpmIxSvFpfRzoORinQGqvZBCEwQoAxfMlkaIRCnQH/o66v8Re19MavaDNLfgAAAABJRU5ErkJggg==</Image>

<Url type="text/html" method="GET" template="http://localhost:$port_number/youtube.com/search">
  <Param name="query" value="{searchTerms}"/>
</Url>
<SearchForm>http://localhost:$port_number/youtube.com/search</SearchForm>
</SearchPlugin>
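The $port_number placeholders in this file look like string.Template-style variables, presumably substituted with the configured port when the plugin is served. How the project actually performs that substitution is not shown in this commit; a minimal sketch under that assumption:

# Assumption: $port_number is filled in via string.Template; the port value is illustrative.
from string import Template

with open('youtube/opensearch.xml') as f:
    xml = Template(f.read()).substitute(port_number=80)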
youtube/playlist.py (new file, 123 lines)
@@ -0,0 +1,123 @@
from youtube import util, yt_data_extract, proto, local_playlist
from youtube import yt_app

import base64
import urllib
import json
import string
import gevent
import math
from flask import request
import flask


def playlist_ctoken(playlist_id, offset):

    offset = proto.uint(1, offset)
    # this is just obfuscation as far as I can tell. It doesn't even follow protobuf
    offset = b'PT:' + proto.unpadded_b64encode(offset)
    offset = proto.string(15, offset)

    continuation_info = proto.string( 3, proto.percent_b64encode(offset) )

    playlist_id = proto.string(2, 'VL' + playlist_id )
    pointless_nest = proto.string(80226972, playlist_id + continuation_info)

    return base64.urlsafe_b64encode(pointless_nest).decode('ascii')

# initial request types:
#   polymer_json: https://m.youtube.com/playlist?list=PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ&pbj=1&lact=0
#   ajax json:    https://m.youtube.com/playlist?list=PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ&pbj=1&lact=0 with header X-YouTube-Client-Version: 1.20180418

# continuation request types:
#   polymer_json: https://m.youtube.com/playlist?&ctoken=[...]&pbj=1
#   ajax json:    https://m.youtube.com/playlist?action_continuation=1&ajax=1&ctoken=[...]


headers_1 = (
    ('Accept', '*/*'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('X-YouTube-Client-Name', '2'),
    ('X-YouTube-Client-Version', '2.20180614'),
)

def playlist_first_page(playlist_id, report_text = "Retrieved playlist"):
    url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&pbj=1'
    content = util.fetch_url(url, util.mobile_ua + headers_1, report_text=report_text, debug_name='playlist_first_page')
    content = json.loads(util.uppercase_escape(content.decode('utf-8')))

    return content


#https://m.youtube.com/playlist?itct=CBMQybcCIhMIptj9xJaJ2wIV2JKcCh3Idwu-&ctoken=4qmFsgI2EiRWTFBMT3kwajlBdmxWWlB0bzZJa2pLZnB1MFNjeC0tN1BHVEMaDmVnWlFWRHBEUWxFJTNE&pbj=1
def get_videos(playlist_id, page):

    url = "https://m.youtube.com/playlist?ctoken=" + playlist_ctoken(playlist_id, (int(page)-1)*20) + "&pbj=1"
    headers = {
        'User-Agent': ' Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'X-YouTube-Client-Name': '2',
        'X-YouTube-Client-Version': '2.20180508',
    }

    content = util.fetch_url(url, headers, report_text="Retrieved playlist", debug_name='playlist_videos')

    info = json.loads(util.uppercase_escape(content.decode('utf-8')))
    return info


@yt_app.route('/playlist')
def get_playlist_page():
    if 'list' not in request.args:
        abort(400)

    playlist_id = request.args.get('list')
    page = request.args.get('page', '1')

    if page == '1':
        first_page_json = playlist_first_page(playlist_id)
        this_page_json = first_page_json
    else:
        tasks = (
            gevent.spawn(playlist_first_page, playlist_id, report_text="Retrieved playlist info" ),
            gevent.spawn(get_videos, playlist_id, page)
        )
        gevent.joinall(tasks)
        util.check_gevent_exceptions(*tasks)
        first_page_json, this_page_json = tasks[0].value, tasks[1].value

    info = yt_data_extract.extract_playlist_info(this_page_json)
    if info['error']:
        return flask.render_template('error.html', error_message = info['error'])

    if page != '1':
        info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json)

    util.prefix_urls(info['metadata'])
    for item in info.get('items', ()):
        util.prefix_urls(item)
        util.add_extra_html_info(item)
        if 'id' in item:
            item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg'

        item['url'] += '&list=' + playlist_id
        if item['index']:
            item['url'] += '&index=' + str(item['index'])

    video_count = yt_data_extract.deep_get(info, 'metadata', 'video_count')
    if video_count is None:
        video_count = 40

    return flask.render_template('playlist.html',
        header_playlist_names = local_playlist.get_playlist_names(),
        video_list = info.get('items', []),
        num_pages = math.ceil(video_count/20),
        parameters_dictionary = request.args,

        **info['metadata']
    ).encode('utf-8')
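As a quick illustration of the continuation logic above: page N of a playlist is fetched with a ctoken that encodes offset (N-1)*20. The playlist id below is a placeholder.

# Illustrative only; 'PLxxxxxxxxxxxxxxxx' is a placeholder playlist id.
token = playlist_ctoken('PLxxxxxxxxxxxxxxxx', offset=40)   # offset 40 corresponds to page 3
# get_videos('PLxxxxxxxxxxxxxxxx', page=3) builds the same token internally
# and requests https://m.youtube.com/playlist?ctoken=...&pbj=1 with it.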
youtube/proto.py (new file, 129 lines)
@@ -0,0 +1,129 @@
from math import ceil
import base64
import io

def byte(n):
    return bytes((n,))


def varint_encode(offset):
    '''In this encoding system, for each 8-bit byte, the first bit is 1 if there are more bytes, and 0 is this is the last one.
    The next 7 bits are data. These 7-bit sections represent the data in Little endian order. For example, suppose the data is
    aaaaaaabbbbbbbccccccc (each of these sections is 7 bits). It will be encoded as:
    1ccccccc 1bbbbbbb 0aaaaaaa

    This encoding is used in youtube parameters to encode offsets and to encode the length for length-prefixed data.
    See https://developers.google.com/protocol-buffers/docs/encoding#varints for more info.'''
    needed_bytes = ceil(offset.bit_length()/7) or 1 # (0).bit_length() returns 0, but we need 1 in that case.
    encoded_bytes = bytearray(needed_bytes)
    for i in range(0, needed_bytes - 1):
        encoded_bytes[i] = (offset & 127) | 128 # 7 least significant bits
        offset = offset >> 7
    encoded_bytes[-1] = offset & 127 # leave first bit as zero for last byte

    return bytes(encoded_bytes)


def varint_decode(encoded):
    decoded = 0
    for i, byte in enumerate(encoded):
        decoded |= (byte & 127) << 7*i

        if not (byte & 128):
            break
    return decoded


def string(field_number, data):
    data = as_bytes(data)
    return _proto_field(2, field_number, varint_encode(len(data)) + data)
nested = string

def uint(field_number, value):
    return _proto_field(0, field_number, varint_encode(value))


def _proto_field(wire_type, field_number, data):
    ''' See https://developers.google.com/protocol-buffers/docs/encoding#structure '''
    return varint_encode( (field_number << 3) | wire_type) + data


def percent_b64encode(data):
    return base64.urlsafe_b64encode(data).replace(b'=', b'%3D')


def unpadded_b64encode(data):
    return base64.urlsafe_b64encode(data).replace(b'=', b'')

def as_bytes(value):
    if isinstance(value, str):
        return value.encode('utf-8')
    return value


def read_varint(data):
    result = 0
    i = 0
    while True:
        try:
            byte = data.read(1)[0]
        except IndexError:
            if i == 0:
                raise EOFError()
            raise Exception('Unterminated varint starting at ' + str(data.tell() - i))
        result |= (byte & 127) << 7*i
        if not byte & 128:
            break

        i += 1
    return result


def read_group(data, end_sequence):
    start = data.tell()
    index = data.original.find(end_sequence, start)
    if index == -1:
        raise Exception('Unterminated group')
    data.seek(index + len(end_sequence))
    return data.original[start:index]

def read_protobuf(data):
    data_original = data
    data = io.BytesIO(data)
    data.original = data_original
    while True:
        try:
            tag = read_varint(data)
        except EOFError:
            break
        wire_type = tag & 7
        field_number = tag >> 3

        if wire_type == 0:
            value = read_varint(data)
        elif wire_type == 1:
            value = data.read(8)
        elif wire_type == 2:
            length = read_varint(data)
            value = data.read(length)
        elif wire_type == 3:
            end_bytes = encode_varint((field_number << 3) | 4)
            value = read_group(data, end_bytes)
        elif wire_type == 5:
            value = data.read(4)
        else:
            raise Exception("Unknown wire type: " + str(wire_type) + ", Tag: " + bytes_to_hex(succinct_encode(tag)) + ", at position " + str(data.tell()))
        yield (wire_type, field_number, value)

def parse(data):
    return {field_number: value for _, field_number, value in read_protobuf(data)}

def b64_to_bytes(data):
    if isinstance(data, bytes):
        data = data.decode('ascii')
    data = data.replace("%3D", "=")
    return base64.urlsafe_b64decode(data + "="*((4 - len(data)%4)%4) )
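A small round-trip example of the varint and field helpers above, showing that the encoding matches the protobuf wire format (the values are chosen arbitrarily):

# Illustrative round trip using the helpers defined above.
assert varint_encode(300) == b'\xac\x02'          # 300 -> two bytes, low 7 bits first
assert varint_decode(b'\xac\x02') == 300
msg = string(2, 'UC1234') + uint(3, 1)            # field 2: length-delimited, field 3: varint
assert parse(msg) == {2: b'UC1234', 3: 1}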
youtube/search.py (new file, 105 lines)
@@ -0,0 +1,105 @@
import base64
import json
import urllib

import flask
from flask import request
from werkzeug.exceptions import abort

from youtube import util, yt_data_extract, proto
from youtube import yt_app

# Sort: 1
#    Upload date: 2
#    View count: 3
#    Rating: 1
#    Relevance: 0
# Offset: 9
# Filters: 2
#    Upload date: 1
#    Type: 2
#    Duration: 3


features = {
    '4k': 14,
    'hd': 4,
    'hdr': 25,
    'subtitles': 5,
    'creative_commons': 6,
    '3d': 7,
    'live': 8,
    'purchased': 9,
    '360': 15,
    'location': 23,
}

def page_number_to_sp_parameter(page, autocorrect, sort, filters):
    offset = (int(page) - 1)*20    # 20 results per page
    autocorrect = proto.nested(8, proto.uint(1, 1 - int(autocorrect) ))
    filters_enc = proto.nested(2, proto.uint(1, filters['time']) + proto.uint(2, filters['type']) + proto.uint(3, filters['duration']))
    result = proto.uint(1, sort) + filters_enc + autocorrect + proto.uint(9, offset) + proto.string(61, b'')
    return base64.urlsafe_b64encode(result).decode('ascii')

def get_search_json(query, page, autocorrect, sort, filters):
    url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(query)
    headers = {
        'Host': 'www.youtube.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'X-YouTube-Client-Name': '1',
        'X-YouTube-Client-Version': '2.20180418',
    }
    url += "&pbj=1&sp=" + page_number_to_sp_parameter(page, autocorrect, sort, filters).replace("=", "%3D")
    content = util.fetch_url(url, headers=headers, report_text="Got search results", debug_name='search_results')
    info = json.loads(content)
    return info


@yt_app.route('/search')
def get_search_page():
    if len(request.args) == 0:
        return flask.render_template('base.html', title="Search")

    if 'query' not in request.args:
        abort(400)

    query = request.args.get("query")
    page = request.args.get("page", "1")
    autocorrect = int(request.args.get("autocorrect", "1"))
    sort = int(request.args.get("sort", "0"))
    filters = {}
    filters['time'] = int(request.args.get("time", "0"))
    filters['type'] = int(request.args.get("type", "0"))
    filters['duration'] = int(request.args.get("duration", "0"))
    polymer_json = get_search_json(query, page, autocorrect, sort, filters)

    search_info = yt_data_extract.extract_search_info(polymer_json)
    if search_info['error']:
        return flask.render_template('error.html', error_message = search_info['error'])

    for extract_item_info in search_info['items']:
        util.prefix_urls(extract_item_info)
        util.add_extra_html_info(extract_item_info)

    corrections = search_info['corrections']
    if corrections['type'] == 'did_you_mean':
        corrected_query_string = request.args.to_dict(flat=False)
        corrected_query_string['query'] = [corrections['corrected_query']]
        corrections['corrected_query_url'] = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True)
    elif corrections['type'] == 'showing_results_for':
        no_autocorrect_query_string = request.args.to_dict(flat=False)
        no_autocorrect_query_string['autocorrect'] = ['0']
        no_autocorrect_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(no_autocorrect_query_string, doseq=True)
        corrections['original_query_url'] = no_autocorrect_query_url

    return flask.render_template('search.html',
        header_playlist_names = local_playlist.get_playlist_names(),
        query = query,
        estimated_results = search_info['estimated_results'],
        estimated_pages = search_info['estimated_pages'],
        corrections = search_info['corrections'],
        results = search_info['items'],
        parameters_dictionary = request.args,
    )
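For reference, a sketch of the sp parameter construction used above: page 2 with autocorrect on, relevance sort, and no filters would be encoded like this (this simply mirrors what get_search_json does internally).

# Illustrative call to the helper defined above.
sp = page_number_to_sp_parameter('2', autocorrect=1, sort=0,
                                 filters={'time': 0, 'type': 0, 'duration': 0})
# The resulting base64 string is appended to the search URL as &sp=...,
# with '=' escaped to %3D.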
397
youtube/util.py
Normal file
397
youtube/util.py
Normal file
@ -0,0 +1,397 @@
|
|||||||
|
import gzip
|
||||||
|
|
||||||
|
from youtube import yt_data_extract
|
||||||
|
|
||||||
|
try:
|
||||||
|
import brotli
|
||||||
|
have_brotli = True
|
||||||
|
except ImportError:
|
||||||
|
have_brotli = False
|
||||||
|
import urllib.parse
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import gevent
|
||||||
|
import gevent.queue
|
||||||
|
import gevent.lock
|
||||||
|
|
||||||
|
# The trouble with the requests library: It ships its own certificate bundle via certifi
|
||||||
|
# instead of using the system certificate store, meaning self-signed certificates
|
||||||
|
# configured by the user will not work. Some draconian networks block TLS unless a corporate
|
||||||
|
# certificate is installed on the system. Additionally, some users install a self signed cert
|
||||||
|
# in order to use programs to modify or monitor requests made by programs on the system.
|
||||||
|
|
||||||
|
# Finally, certificates expire and need to be updated, or are sometimes revoked. Sometimes
|
||||||
|
# certificate authorites go rogue and need to be untrusted. Since we are going through Tor exit nodes,
|
||||||
|
# this becomes all the more important. A rogue CA could issue a fake certificate for accounts.google.com, and a
|
||||||
|
# malicious exit node could use this to decrypt traffic when logging in and retrieve passwords. Examples:
|
||||||
|
# https://www.engadget.com/2015/10/29/google-warns-symantec-over-certificates/
|
||||||
|
# https://nakedsecurity.sophos.com/2013/12/09/serious-security-google-finds-fake-but-trusted-ssl-certificates-for-its-domains-made-in-france/
|
||||||
|
|
||||||
|
# In the requests documentation it says:
|
||||||
|
# "Before version 2.16, Requests bundled a set of root CAs that it trusted, sourced from the Mozilla trust store.
|
||||||
|
# The certificates were only updated once for each Requests version. When certifi was not installed,
|
||||||
|
# this led to extremely out-of-date certificate bundles when using significantly older versions of Requests.
|
||||||
|
# For the sake of security we recommend upgrading certifi frequently!"
|
||||||
|
# (http://docs.python-requests.org/en/master/user/advanced/#ca-certificates)
|
||||||
|
|
||||||
|
# Expecting users to remember to manually update certifi on Linux isn't reasonable in my view.
|
||||||
|
# On windows, this is even worse since I am distributing all dependencies. This program is not
|
||||||
|
# updated frequently, and using requests would lead to outdated certificates. Certificates
|
||||||
|
# should be updated with OS updates, instead of thousands of developers of different programs
|
||||||
|
# being expected to do this correctly 100% of the time.
|
||||||
|
|
||||||
|
# There is hope that this might be fixed eventually:
|
||||||
|
# https://github.com/kennethreitz/requests/issues/2966
|
||||||
|
|
||||||
|
# Until then, I will use a mix of urllib3 and urllib.
|
||||||
|
import urllib3
|
||||||
|
import urllib3.contrib.socks
|
||||||
|
|
||||||
|
URL_ORIGIN = "/https://www.youtube.com"
|
||||||
|
|
||||||
|
connection_pool = urllib3.PoolManager(cert_reqs = 'CERT_REQUIRED')
|
||||||
|
|
||||||
|
def get_pool(use_tor):
|
||||||
|
return connection_pool
|
||||||
|
|
||||||
|
class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
|
||||||
|
'''Separate cookiejars for receiving and sending'''
|
||||||
|
def __init__(self, cookiejar_send=None, cookiejar_receive=None):
|
||||||
|
self.cookiejar_send = cookiejar_send
|
||||||
|
self.cookiejar_receive = cookiejar_receive
|
||||||
|
|
||||||
|
def http_request(self, request):
|
||||||
|
if self.cookiejar_send is not None:
|
||||||
|
self.cookiejar_send.add_cookie_header(request)
|
||||||
|
return request
|
||||||
|
|
||||||
|
def http_response(self, request, response):
|
||||||
|
if self.cookiejar_receive is not None:
|
||||||
|
self.cookiejar_receive.extract_cookies(response, request)
|
||||||
|
return response
|
||||||
|
|
||||||
|
https_request = http_request
|
||||||
|
https_response = http_response
|
||||||
|
|
||||||
|
class FetchError(Exception):
|
||||||
|
def __init__(self, code, reason='', ip=None):
|
||||||
|
Exception.__init__(self, 'HTTP error during request: ' + code + ' ' + reason)
|
||||||
|
self.code = code
|
||||||
|
self.reason = reason
|
||||||
|
self.ip = ip
|
||||||
|
|
||||||
|
def decode_content(content, encoding_header):
|
||||||
|
encodings = encoding_header.replace(' ', '').split(',')
|
||||||
|
for encoding in reversed(encodings):
|
||||||
|
if encoding == 'identity':
|
||||||
|
continue
|
||||||
|
if encoding == 'br':
|
||||||
|
content = brotli.decompress(content)
|
||||||
|
elif encoding == 'gzip':
|
||||||
|
content = gzip.decompress(content)
|
||||||
|
return content
|
||||||
|
|
||||||
|
def fetch_url_response(url, headers=(), timeout=15, data=None,
|
||||||
|
cookiejar_send=None, cookiejar_receive=None,
|
||||||
|
use_tor=True, max_redirects=None):
|
||||||
|
'''
|
||||||
|
returns response, cleanup_function
|
||||||
|
When cookiejar_send is set to a CookieJar object,
|
||||||
|
those cookies will be sent in the request (but cookies in response will not be merged into it)
|
||||||
|
When cookiejar_receive is set to a CookieJar object,
|
||||||
|
cookies received in the response will be merged into the object (nothing will be sent from it)
|
||||||
|
When both are set to the same object, cookies will be sent from the object,
|
||||||
|
and response cookies will be merged into it.
|
||||||
|
'''
|
||||||
|
headers = dict(headers) # Note: Calling dict() on a dict will make a copy
|
||||||
|
if have_brotli:
|
||||||
|
headers['Accept-Encoding'] = 'gzip, br'
|
||||||
|
else:
|
||||||
|
headers['Accept-Encoding'] = 'gzip'
|
||||||
|
|
||||||
|
# prevent python version being leaked by urllib if User-Agent isn't provided
|
||||||
|
# (urllib will use ex. Python-urllib/3.6 otherwise)
|
||||||
|
if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers:
|
||||||
|
headers['User-Agent'] = 'Python-urllib'
|
||||||
|
|
||||||
|
method = "GET"
|
||||||
|
if data is not None:
|
||||||
|
method = "POST"
|
||||||
|
if isinstance(data, str):
|
||||||
|
data = data.encode('ascii')
|
||||||
|
elif not isinstance(data, bytes):
|
||||||
|
data = urllib.parse.urlencode(data).encode('ascii')
|
||||||
|
|
||||||
|
if cookiejar_send is not None or cookiejar_receive is not None: # Use urllib
|
||||||
|
req = urllib.request.Request(url, data=data, headers=headers)
|
||||||
|
|
||||||
|
cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
|
||||||
|
opener = urllib.request.build_opener(cookie_processor)
|
||||||
|
|
||||||
|
response = opener.open(req, timeout=timeout)
|
||||||
|
cleanup_func = (lambda r: None)
|
||||||
|
|
||||||
|
else: # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
|
||||||
|
# default: Retry.DEFAULT = Retry(3)
|
||||||
|
# (in connectionpool.py in urllib3)
|
||||||
|
# According to the documentation for urlopen, a redirect counts as a
|
||||||
|
# retry. So there are 3 redirects max by default.
|
||||||
|
if max_redirects:
|
||||||
|
retries = urllib3.Retry(3+max_redirects, redirect=max_redirects)
|
||||||
|
else:
|
||||||
|
retries = urllib3.Retry(3)
|
||||||
|
pool = get_pool(use_tor)
|
||||||
|
response = pool.request(method, url, headers=headers,
|
||||||
|
timeout=timeout, preload_content=False,
|
||||||
|
decode_content=False, retries=retries)
|
||||||
|
cleanup_func = (lambda r: r.release_conn())
|
||||||
|
|
||||||
|
return response, cleanup_func
|
||||||
|
|
||||||
|
def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
|
||||||
|
cookiejar_send=None, cookiejar_receive=None, use_tor=True,
|
||||||
|
debug_name=None):
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
response, cleanup_func = fetch_url_response(
|
||||||
|
url, headers, timeout=timeout,
|
||||||
|
cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive,
|
||||||
|
use_tor=use_tor)
|
||||||
|
response_time = time.time()
|
||||||
|
|
||||||
|
content = response.read()
|
||||||
|
read_finish = time.time()
|
||||||
|
|
||||||
|
cleanup_func(response) # release_connection for urllib3
|
||||||
|
|
||||||
|
if (response.status == 429
|
||||||
|
and content.startswith(b'<!DOCTYPE')
|
||||||
|
and b'Our systems have detected unusual traffic' in content):
|
||||||
|
ip = re.search(br'IP address: ((?:[\da-f]*:)+[\da-f]+|(?:\d+\.)+\d+)',
|
||||||
|
content)
|
||||||
|
ip = ip.group(1).decode('ascii') if ip else None
|
||||||
|
raise FetchError('429', reason=response.reason, ip=ip)
|
||||||
|
|
||||||
|
elif response.status >= 400:
|
||||||
|
raise FetchError(str(response.status), reason=response.reason, ip=None)
|
||||||
|
|
||||||
|
if report_text:
|
||||||
|
print(report_text, ' Latency:', round(response_time - start_time,3), ' Read time:', round(read_finish - response_time,3))
|
||||||
|
content = decode_content(content, response.getheader('Content-Encoding', default='identity'))
|
||||||
|
return content
|
||||||
|
|
||||||
|
def head(url, use_tor=False, report_text=None, max_redirects=10):
|
||||||
|
pool = get_pool(use_tor)
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
# default: Retry.DEFAULT = Retry(3)
|
||||||
|
# (in connectionpool.py in urllib3)
|
||||||
|
# According to the documentation for urlopen, a redirect counts as a retry
|
||||||
|
# So there are 3 redirects max by default. Let's change that
|
||||||
|
# to 10 since googlevideo redirects a lot.
|
||||||
|
retries = urllib3.Retry(3+max_redirects, redirect=max_redirects,
|
||||||
|
raise_on_redirect=False)
|
||||||
|
headers = {'User-Agent': 'Python-urllib'}
|
||||||
|
response = pool.request('HEAD', url, headers=headers, retries=retries)
|
||||||
|
if report_text:
|
||||||
|
print(report_text, ' Latency:', round(time.time() - start_time,3))
|
||||||
|
return response
|
||||||
|
|
||||||
|
mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
|
||||||
|
mobile_ua = (('User-Agent', mobile_user_agent),)
|
||||||
|
desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
|
||||||
|
desktop_ua = (('User-Agent', desktop_user_agent),)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimitedQueue(gevent.queue.Queue):
|
||||||
|
''' Does initial_burst (def. 30) at first, then alternates between waiting waiting_period (def. 5) seconds and doing subsequent_bursts (def. 10) queries. After 5 seconds with nothing left in the queue, resets rate limiting. '''
|
||||||
|
|
||||||
|
def __init__(self, initial_burst=30, waiting_period=5, subsequent_bursts=10):
|
||||||
|
self.initial_burst = initial_burst
|
||||||
|
self.waiting_period = waiting_period
|
||||||
|
self.subsequent_bursts = subsequent_bursts
|
||||||
|
|
||||||
|
self.count_since_last_wait = 0
|
||||||
|
self.surpassed_initial = False
|
||||||
|
|
||||||
|
self.lock = gevent.lock.BoundedSemaphore(1)
|
||||||
|
self.currently_empty = False
|
||||||
|
self.empty_start = 0
|
||||||
|
gevent.queue.Queue.__init__(self)
|
||||||
|
|
||||||
|
|
||||||
|
def get(self):
|
||||||
|
self.lock.acquire() # blocks if another greenlet currently has the lock
|
||||||
|
if self.count_since_last_wait >= self.subsequent_bursts and self.surpassed_initial:
|
||||||
|
gevent.sleep(self.waiting_period)
|
||||||
|
self.count_since_last_wait = 0
|
||||||
|
|
||||||
|
elif self.count_since_last_wait >= self.initial_burst and not self.surpassed_initial:
|
||||||
|
self.surpassed_initial = True
|
||||||
|
gevent.sleep(self.waiting_period)
|
||||||
|
self.count_since_last_wait = 0
|
||||||
|
|
||||||
|
self.count_since_last_wait += 1
|
||||||
|
|
||||||
|
if not self.currently_empty and self.empty():
|
||||||
|
self.currently_empty = True
|
||||||
|
self.empty_start = time.monotonic()
|
||||||
|
|
||||||
|
item = gevent.queue.Queue.get(self) # blocks when nothing left
|
||||||
|
|
||||||
|
if self.currently_empty:
|
||||||
|
if time.monotonic() - self.empty_start >= self.waiting_period:
|
||||||
|
self.count_since_last_wait = 0
|
||||||
|
self.surpassed_initial = False
|
||||||
|
|
||||||
|
self.currently_empty = False
|
||||||
|
|
||||||
|
self.lock.release()
|
||||||
|
|
||||||
|
return item
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
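A minimal usage sketch of RateLimitedQueue under the burst/wait behaviour described in its docstring; the worker function, the three-greenlet pool and the 100-item workload are hypothetical and not part of Yotter:

import gevent

def worker(queue, results):
    # Each call to queue.get() goes through the rate limiter above;
    # a None item is used here as a hypothetical stop sentinel.
    while True:
        item = queue.get()
        if item is None:
            return
        results.append(item * 2)  # stand-in for the real network request

queue = RateLimitedQueue()  # 30 immediately, then 10 per 5-second window
results = []
workers = [gevent.spawn(worker, queue, results) for _ in range(3)]
for task in range(100):
    queue.put(task)
for _ in workers:
    queue.put(None)  # one sentinel per worker
gevent.joinall(workers)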
def download_thumbnail(save_directory, video_id):
|
||||||
|
url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
|
||||||
|
save_location = os.path.join(save_directory, video_id + ".jpg")
|
||||||
|
try:
|
||||||
|
thumbnail = fetch_url(url, report_text="Saved thumbnail: " + video_id)
|
||||||
|
except urllib.error.HTTPError as e:
|
||||||
|
print("Failed to download thumbnail for " + video_id + ": " + str(e))
|
||||||
|
return False
|
||||||
|
try:
|
||||||
|
f = open(save_location, 'wb')
|
||||||
|
except FileNotFoundError:
|
||||||
|
os.makedirs(save_directory, exist_ok = True)
|
||||||
|
f = open(save_location, 'wb')
|
||||||
|
f.write(thumbnail)
|
||||||
|
f.close()
|
||||||
|
return True
|
||||||
|
|
||||||
|
def download_thumbnails(save_directory, ids):
|
||||||
|
if not isinstance(ids, (list, tuple)):
|
||||||
|
ids = list(ids)
|
||||||
|
# only do 5 at a time
|
||||||
|
# first handle the part of the list that fits into full batches of 5
|
||||||
|
i = -1
|
||||||
|
for i in range(0, int(len(ids)/5) - 1 ):
|
||||||
|
gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5, i*5 + 5)])
|
||||||
|
# do the remainders (< 5)
|
||||||
|
gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5 + 5, len(ids))])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def dict_add(*dicts):
|
||||||
|
for dictionary in dicts[1:]:
|
||||||
|
dicts[0].update(dictionary)
|
||||||
|
return dicts[0]
|
||||||
|
|
||||||
|
def video_id(url):
|
||||||
|
url_parts = urllib.parse.urlparse(url)
|
||||||
|
return urllib.parse.parse_qs(url_parts.query)['v'][0]
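For illustration, video_id() simply pulls the v parameter out of a watch URL; the URL below is a made-up example:

assert video_id('https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=43') == 'dQw4w9WgXcQ'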
|
||||||
|
|
||||||
|
|
||||||
|
# default, sddefault, mqdefault, hqdefault, hq720
|
||||||
|
def get_thumbnail_url(video_id):
|
||||||
|
return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
|
||||||
|
|
||||||
|
def seconds_to_timestamp(seconds):
|
||||||
|
seconds = int(seconds)
|
||||||
|
hours, seconds = divmod(seconds,3600)
|
||||||
|
minutes, seconds = divmod(seconds,60)
|
||||||
|
if hours != 0:
|
||||||
|
timestamp = str(hours) + ":"
|
||||||
|
timestamp += str(minutes).zfill(2) # zfill pads with zeros
|
||||||
|
else:
|
||||||
|
timestamp = str(minutes)
|
||||||
|
|
||||||
|
timestamp += ":" + str(seconds).zfill(2)
|
||||||
|
return timestamp
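A few hand-worked examples of the expected output (computed from the logic above, not taken from the source):

assert seconds_to_timestamp(59) == '0:59'
assert seconds_to_timestamp(75) == '1:15'
assert seconds_to_timestamp(3601) == '1:00:01'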
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def update_query_string(query_string, items):
|
||||||
|
parameters = urllib.parse.parse_qs(query_string)
|
||||||
|
parameters.update(items)
|
||||||
|
return urllib.parse.urlencode(parameters, doseq=True)
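A hedged example of how update_query_string rewrites an existing query string; parameter order follows the insertion order of the parsed dict:

# Replace the t parameter while keeping v intact.
assert update_query_string('v=abc123&t=10', {'t': ['42']}) == 'v=abc123&t=42'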
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def uppercase_escape(s):
|
||||||
|
return re.sub(
|
||||||
|
r'\\U([0-9a-fA-F]{8})',
|
||||||
|
lambda m: chr(int(m.group(1), base=16)), s)
|
||||||
|
|
||||||
|
def prefix_url(url):
|
||||||
|
if url is None:
|
||||||
|
return None
|
||||||
|
url = url.lstrip('/') # some urls have // before them, which has a special meaning
|
||||||
|
return '/' + url
|
||||||
|
|
||||||
|
def left_remove(string, substring):
|
||||||
|
'''removes substring from the start of string, if present'''
|
||||||
|
if string.startswith(substring):
|
||||||
|
return string[len(substring):]
|
||||||
|
return string
|
||||||
|
|
||||||
|
def concat_or_none(*strings):
|
||||||
|
'''Concatenates strings. Returns None if any of the arguments are None'''
|
||||||
|
result = ''
|
||||||
|
for string in strings:
|
||||||
|
if string is None:
|
||||||
|
return None
|
||||||
|
result += string
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def prefix_urls(item):
|
||||||
|
try:
|
||||||
|
item['thumbnail'] = prefix_url(item['thumbnail'])
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
item['author_url'] = prefix_url(item['author_url'])
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def add_extra_html_info(item):
|
||||||
|
if item['type'] == 'video':
|
||||||
|
item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None
|
||||||
|
|
||||||
|
video_info = {}
|
||||||
|
for key in ('id', 'title', 'author', 'duration'):
|
||||||
|
try:
|
||||||
|
video_info[key] = item[key]
|
||||||
|
except KeyError:
|
||||||
|
video_info[key] = ''
|
||||||
|
|
||||||
|
item['video_info'] = json.dumps(video_info)
|
||||||
|
|
||||||
|
elif item['type'] == 'playlist':
|
||||||
|
item['url'] = (URL_ORIGIN + '/playlist?list=' + item['id']) if item.get('id') else None
|
||||||
|
elif item['type'] == 'channel':
|
||||||
|
item['url'] = (URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None
|
||||||
|
|
||||||
|
def parse_info_prepare_for_html(renderer, additional_info={}):
|
||||||
|
item = yt_data_extract.extract_item_info(renderer, additional_info)
|
||||||
|
prefix_urls(item)
|
||||||
|
add_extra_html_info(item)
|
||||||
|
|
||||||
|
return item
|
||||||
|
|
||||||
|
def check_gevent_exceptions(*tasks):
|
||||||
|
for task in tasks:
|
||||||
|
if task.exception:
|
||||||
|
raise task.exception
|
||||||
|
|
61 youtube/utils.py (new file)
@@ -0,0 +1,61 @@
|
|||||||
|
import urllib
|
||||||
|
from flask import Markup
|
||||||
|
import bleach
|
||||||
|
def get_description_snippet_text(ds):
|
||||||
|
string = ""
|
||||||
|
for t in ds:
|
||||||
|
try:
|
||||||
|
if t['bold']:
|
||||||
|
text = "<b>"+t['text']+"</b>"
|
||||||
|
else:
|
||||||
|
text = t['text']
|
||||||
|
except KeyError:
|
||||||
|
text = t['text']
|
||||||
|
string = string + text
|
||||||
|
return string
|
||||||
|
|
||||||
|
|
||||||
|
def concat_texts(strings):
|
||||||
|
'''Concatenates strings. Returns None if any of the arguments are None'''
|
||||||
|
result = ''
|
||||||
|
for string in strings:
|
||||||
|
if string['text'] is None:
|
||||||
|
return None
|
||||||
|
result += string['text']
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def parse_comment(raw_comment):
|
||||||
|
cmnt = {}
|
||||||
|
imgHostName = urllib.parse.urlparse(raw_comment['author_avatar'][1:]).netloc
|
||||||
|
cmnt['author'] = raw_comment['author']
|
||||||
|
cmnt['thumbnail'] = raw_comment['author_avatar'].replace("https://{}".format(imgHostName),"")[1:] + "?host=" + imgHostName
|
||||||
|
|
||||||
|
print(cmnt['thumbnail'])
|
||||||
|
cmnt['channel'] = raw_comment['author_url']
|
||||||
|
cmnt['text'] = Markup(bleach.linkify(concat_texts(raw_comment['text']).replace("\n", "<br>")))
|
||||||
|
cmnt['date'] = raw_comment['time_published']
|
||||||
|
|
||||||
|
try:
|
||||||
|
cmnt['creatorHeart'] = raw_comment['creatorHeart']['creatorHeartRenderer']['creatorThumbnail']['thumbnails'][0][
|
||||||
|
'url']
|
||||||
|
except (KeyError, IndexError, TypeError):
|
||||||
|
cmnt['creatorHeart'] = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
cmnt['likes'] = raw_comment['like_count']
|
||||||
|
except KeyError:
|
||||||
|
cmnt['likes'] = 0
|
||||||
|
|
||||||
|
try:
|
||||||
|
cmnt['replies'] = raw_comment['reply_count']
|
||||||
|
except KeyError:
|
||||||
|
cmnt['replies'] = 0
|
||||||
|
return cmnt
|
||||||
|
|
||||||
|
|
||||||
|
def post_process_comments_info(comments_info):
|
||||||
|
comments = []
|
||||||
|
for comment in comments_info['comments']:
|
||||||
|
comments.append(parse_comment(comment))
|
||||||
|
return comments
|
246 youtube/watch.py (new file)
@@ -0,0 +1,246 @@
|
|||||||
|
import json
|
||||||
|
import math
|
||||||
|
import traceback
|
||||||
|
import urllib
|
||||||
|
|
||||||
|
from youtube import util, yt_data_extract
|
||||||
|
|
||||||
|
|
||||||
|
def get_video_sources(info, tor_bypass=False):
|
||||||
|
video_sources = []
|
||||||
|
max_resolution = 720  # keep as an int so the comparison with fmt['height'] below works
|
||||||
|
for fmt in info['formats']:
|
||||||
|
if not all(fmt[attr] for attr in ('quality', 'width', 'ext', 'url')):
|
||||||
|
continue
|
||||||
|
if fmt['acodec'] and fmt['vcodec'] and fmt['height'] <= max_resolution:
|
||||||
|
video_sources.append({
|
||||||
|
'src': fmt['url'],
|
||||||
|
'type': 'video/' + fmt['ext'],
|
||||||
|
'quality': fmt['quality'],
|
||||||
|
'height': fmt['height'],
|
||||||
|
'width': fmt['width'],
|
||||||
|
})
|
||||||
|
|
||||||
|
#### order the videos sources so the preferred resolution is first ###
|
||||||
|
|
||||||
|
video_sources.sort(key=lambda source: source['quality'], reverse=True)
|
||||||
|
|
||||||
|
return video_sources
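For reference, each entry in the returned list is a plain dict shaped roughly like the sketch below; the values are illustrative only and not taken from a real video:

example_source = {
    'src': 'https://example.googlevideo.com/videoplayback?itag=22',  # direct stream URL
    'type': 'video/mp4',
    'quality': 'hd720',   # whatever the extractor reports for the format
    'height': 720,
    'width': 1280,
}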
|
||||||
|
|
||||||
|
def make_caption_src(info, lang, auto=False, trans_lang=None):
|
||||||
|
label = lang
|
||||||
|
if auto:
|
||||||
|
label += ' (Automatic)'
|
||||||
|
if trans_lang:
|
||||||
|
label += ' -> ' + trans_lang
|
||||||
|
return {
|
||||||
|
'url': '/' + yt_data_extract.get_caption_url(info, lang, 'vtt', auto, trans_lang),
|
||||||
|
'label': label,
|
||||||
|
'srclang': trans_lang[0:2] if trans_lang else lang[0:2],
|
||||||
|
'on': False,
|
||||||
|
}
|
||||||
|
|
||||||
|
def lang_in(lang, sequence):
|
||||||
|
'''Tests if the language is in sequence, with e.g. en and en-US considered the same'''
|
||||||
|
if lang is None:
|
||||||
|
return False
|
||||||
|
lang = lang[0:2]
|
||||||
|
return lang in (l[0:2] for l in sequence)
|
||||||
|
|
||||||
|
def lang_eq(lang1, lang2):
|
||||||
|
'''Tests if two iso 639-1 codes are equal, with en and en-US considered the same.
|
||||||
|
Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model'''
|
||||||
|
if lang1 is None or lang2 is None:
|
||||||
|
return False
|
||||||
|
return lang1[0:2] == lang2[0:2]
|
||||||
|
|
||||||
|
def equiv_lang_in(lang, sequence):
|
||||||
|
'''Extracts a language in sequence which is equivalent to lang.
|
||||||
|
e.g. if lang is en, extracts en-GB from sequence.
|
||||||
|
Necessary because if only a specific variant like en-GB is available, can't ask Youtube for simply en. Need to get the available variant.'''
|
||||||
|
lang = lang[0:2]
|
||||||
|
for l in sequence:
|
||||||
|
if l[0:2] == lang:
|
||||||
|
return l
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_subtitle_sources(info):
|
||||||
|
'''Returns these sources, ordered from least to most intelligible:
|
||||||
|
native_video_lang (Automatic)
|
||||||
|
foreign_langs (Manual)
|
||||||
|
native_video_lang (Automatic) -> pref_lang
|
||||||
|
foreign_langs (Manual) -> pref_lang
|
||||||
|
native_video_lang (Manual) -> pref_lang
|
||||||
|
pref_lang (Automatic)
|
||||||
|
pref_lang (Manual)'''
|
||||||
|
sources = []
|
||||||
|
pref_lang = 'en'
|
||||||
|
native_video_lang = None
|
||||||
|
if info['automatic_caption_languages']:
|
||||||
|
native_video_lang = info['automatic_caption_languages'][0]
|
||||||
|
|
||||||
|
highest_fidelity_is_manual = False
|
||||||
|
|
||||||
|
# Sources are added in very specific order outlined above
|
||||||
|
# More intelligible sources are put further down to avoid browser bug when there are too many languages
|
||||||
|
# (in firefox, it is impossible to select a language near the top of the list because it is cut off)
|
||||||
|
|
||||||
|
# native_video_lang (Automatic)
|
||||||
|
if native_video_lang and not lang_eq(native_video_lang, pref_lang):
|
||||||
|
sources.append(make_caption_src(info, native_video_lang, auto=True))
|
||||||
|
|
||||||
|
# foreign_langs (Manual)
|
||||||
|
for lang in info['manual_caption_languages']:
|
||||||
|
if not lang_eq(lang, pref_lang):
|
||||||
|
sources.append(make_caption_src(info, lang))
|
||||||
|
|
||||||
|
if (lang_in(pref_lang, info['translation_languages'])
|
||||||
|
and not lang_in(pref_lang, info['automatic_caption_languages'])
|
||||||
|
and not lang_in(pref_lang, info['manual_caption_languages'])):
|
||||||
|
# native_video_lang (Automatic) -> pref_lang
|
||||||
|
if native_video_lang and not lang_eq(pref_lang, native_video_lang):
|
||||||
|
sources.append(make_caption_src(info, native_video_lang, auto=True, trans_lang=pref_lang))
|
||||||
|
|
||||||
|
# foreign_langs (Manual) -> pref_lang
|
||||||
|
for lang in info['manual_caption_languages']:
|
||||||
|
if not lang_eq(lang, native_video_lang) and not lang_eq(lang, pref_lang):
|
||||||
|
sources.append(make_caption_src(info, lang, trans_lang=pref_lang))
|
||||||
|
|
||||||
|
# native_video_lang (Manual) -> pref_lang
|
||||||
|
if lang_in(native_video_lang, info['manual_caption_languages']):
|
||||||
|
sources.append(make_caption_src(info, native_video_lang, trans_lang=pref_lang))
|
||||||
|
|
||||||
|
# pref_lang (Automatic)
|
||||||
|
if lang_in(pref_lang, info['automatic_caption_languages']):
|
||||||
|
sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['automatic_caption_languages']), auto=True))
|
||||||
|
|
||||||
|
# pref_lang (Manual)
|
||||||
|
if lang_in(pref_lang, info['manual_caption_languages']):
|
||||||
|
sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['manual_caption_languages'])))
|
||||||
|
highest_fidelity_is_manual = True
|
||||||
|
if len(sources) == 0:
|
||||||
|
assert len(info['automatic_caption_languages']) == 0 and len(info['manual_caption_languages']) == 0
|
||||||
|
|
||||||
|
return sources
|
||||||
|
|
||||||
|
|
||||||
|
def get_ordered_music_list_attributes(music_list):
|
||||||
|
# get the set of attributes which are used by at least 1 track
|
||||||
|
# so there isn't an empty, extraneous album column which no tracks use, for example
|
||||||
|
used_attributes = set()
|
||||||
|
for track in music_list:
|
||||||
|
used_attributes = used_attributes | track.keys()
|
||||||
|
|
||||||
|
# now put them in the right order
|
||||||
|
ordered_attributes = []
|
||||||
|
for attribute in ('Artist', 'Title', 'Album'):
|
||||||
|
if attribute.lower() in used_attributes:
|
||||||
|
ordered_attributes.append(attribute)
|
||||||
|
|
||||||
|
return ordered_attributes
|
||||||
|
|
||||||
|
headers = (
|
||||||
|
('Accept', '*/*'),
|
||||||
|
('Accept-Language', 'en-US,en;q=0.5'),
|
||||||
|
('X-YouTube-Client-Name', '2'),
|
||||||
|
('X-YouTube-Client-Version', '2.20180830'),
|
||||||
|
) + util.mobile_ua
|
||||||
|
def extract_info(video_id, use_invidious, playlist_id=None, index=None):
|
||||||
|
# bpctr=9999999999 will bypass are-you-sure dialogs for controversial
|
||||||
|
# videos
|
||||||
|
url = 'https://m.youtube.com/watch?v=' + video_id + '&pbj=1&bpctr=9999999999'
|
||||||
|
if playlist_id:
|
||||||
|
url += '&list=' + playlist_id
|
||||||
|
if index:
|
||||||
|
url += '&index=' + index
|
||||||
|
polymer_json = util.fetch_url(url, headers=headers, debug_name='watch')
|
||||||
|
polymer_json = polymer_json.decode('utf-8')
|
||||||
|
# TODO: Decide whether this should be done in yt_data_extract.extract_watch_info
|
||||||
|
try:
|
||||||
|
polymer_json = json.loads(polymer_json)
|
||||||
|
except json.decoder.JSONDecodeError:
|
||||||
|
traceback.print_exc()
|
||||||
|
return {'error': 'Failed to parse json response'}
|
||||||
|
info = yt_data_extract.extract_watch_info(polymer_json)
|
||||||
|
|
||||||
|
# age restriction bypass
|
||||||
|
if info['age_restricted']:
|
||||||
|
print('Fetching age restriction bypass page')
|
||||||
|
data = {
|
||||||
|
'video_id': video_id,
|
||||||
|
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
|
||||||
|
}
|
||||||
|
url = 'https://www.youtube.com/get_video_info?' + urllib.parse.urlencode(data)
|
||||||
|
video_info_page = util.fetch_url(url, debug_name='get_video_info', report_text='Fetched age restriction bypass page').decode('utf-8')
|
||||||
|
yt_data_extract.update_with_age_restricted_info(info, video_info_page)
|
||||||
|
# check if urls ready (non-live format) in former livestream
|
||||||
|
# urls not ready if all of them have no filesize
|
||||||
|
if info['was_live']:
|
||||||
|
info['urls_ready'] = False
|
||||||
|
for fmt in info['formats']:
|
||||||
|
if fmt['file_size'] is not None:
|
||||||
|
info['urls_ready'] = True
|
||||||
|
else:
|
||||||
|
info['urls_ready'] = True
|
||||||
|
|
||||||
|
# livestream urls
|
||||||
|
# sometimes only the livestream urls work soon after the livestream is over
|
||||||
|
if (info['hls_manifest_url']
|
||||||
|
and (info['live'] or not info['formats'] or not info['urls_ready'])
|
||||||
|
):
|
||||||
|
manifest = util.fetch_url(info['hls_manifest_url'],
|
||||||
|
debug_name='hls_manifest.m3u8',
|
||||||
|
report_text='Fetched hls manifest'
|
||||||
|
).decode('utf-8')
|
||||||
|
|
||||||
|
info['hls_formats'], err = yt_data_extract.extract_hls_formats(manifest)
|
||||||
|
if not err:
|
||||||
|
info['playability_error'] = None
|
||||||
|
for fmt in info['hls_formats']:
|
||||||
|
fmt['video_quality'] = video_quality_string(fmt)
|
||||||
|
else:
|
||||||
|
info['hls_formats'] = []
|
||||||
|
|
||||||
|
# check for 403. Unnecessary for tor video routing b/c ip address is same
|
||||||
|
info['invidious_used'] = False
|
||||||
|
info['invidious_reload_button'] = False
|
||||||
|
info['tor_bypass_used'] = False
|
||||||
|
return info
|
||||||
|
|
||||||
|
def video_quality_string(format):
|
||||||
|
if format['vcodec']:
|
||||||
|
result = str(format['width'] or '?') + 'x' + str(format['height'] or '?')
|
||||||
|
if format['fps']:
|
||||||
|
result += ' ' + str(format['fps']) + 'fps'
|
||||||
|
return result
|
||||||
|
elif format['acodec']:
|
||||||
|
return 'audio only'
|
||||||
|
|
||||||
|
return '?'
|
||||||
|
|
||||||
|
def audio_quality_string(format):
|
||||||
|
if format['acodec']:
|
||||||
|
result = str(format['audio_bitrate'] or '?') + 'k'
|
||||||
|
if format['audio_sample_rate']:
|
||||||
|
result += ' ' + str(format['audio_sample_rate']) + ' Hz'
|
||||||
|
return result
|
||||||
|
elif format['vcodec']:
|
||||||
|
return 'video only'
|
||||||
|
|
||||||
|
return '?'
|
||||||
|
|
||||||
|
# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py
|
||||||
|
def format_bytes(bytes):
|
||||||
|
if bytes is None:
|
||||||
|
return 'N/A'
|
||||||
|
if type(bytes) is str:
|
||||||
|
bytes = float(bytes)
|
||||||
|
if bytes == 0.0:
|
||||||
|
exponent = 0
|
||||||
|
else:
|
||||||
|
exponent = int(math.log(bytes, 1024.0))
|
||||||
|
suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
|
||||||
|
converted = float(bytes) / float(1024 ** exponent)
|
||||||
|
return '%.2f%s' % (converted, suffix)
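Some hand-checked examples of format_bytes (worked from the formula above, not taken from the source):

assert format_bytes(None) == 'N/A'
assert format_bytes(0) == '0.00B'
assert format_bytes(1536) == '1.50KiB'
assert format_bytes('1048576') == '1.00MiB'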
|
||||||
|
|
||||||
|
|
12 youtube/yt_data_extract/__init__.py (new file)
@@ -0,0 +1,12 @@
|
|||||||
|
from .common import (get, multi_get, deep_get, multi_deep_get,
|
||||||
|
liberal_update, conservative_update, remove_redirect, normalize_url,
|
||||||
|
extract_str, extract_formatted_text, extract_int, extract_approx_int,
|
||||||
|
extract_date, extract_item_info, extract_items, extract_response)
|
||||||
|
|
||||||
|
from .everything_else import (extract_channel_info, extract_search_info,
|
||||||
|
extract_playlist_metadata, extract_playlist_info, extract_comments_info)
|
||||||
|
|
||||||
|
from .watch_extraction import (extract_watch_info, get_caption_url,
|
||||||
|
update_with_age_restricted_info, requires_decryption,
|
||||||
|
extract_decryption_function, decrypt_signatures, _formats,
|
||||||
|
update_format_with_type_info, extract_hls_formats)
|
470 youtube/yt_data_extract/common.py (new file)
@@ -0,0 +1,470 @@
|
|||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
import collections
|
||||||
|
|
||||||
|
def get(object, key, default=None, types=()):
|
||||||
|
'''Like dict.get(), but returns default if the result doesn't match one of the types.
|
||||||
|
Also works for indexing lists.'''
|
||||||
|
try:
|
||||||
|
result = object[key]
|
||||||
|
except (TypeError, IndexError, KeyError):
|
||||||
|
return default
|
||||||
|
|
||||||
|
if not types or isinstance(result, types):
|
||||||
|
return result
|
||||||
|
else:
|
||||||
|
return default
|
||||||
|
|
||||||
|
def multi_get(object, *keys, default=None, types=()):
|
||||||
|
'''Like get, but try other keys if the first fails'''
|
||||||
|
for key in keys:
|
||||||
|
try:
|
||||||
|
result = object[key]
|
||||||
|
except (TypeError, IndexError, KeyError):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
if not types or isinstance(result, types):
|
||||||
|
return result
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def deep_get(object, *keys, default=None, types=()):
|
||||||
|
'''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices.
|
||||||
|
Last argument is the default value to use in case of any IndexErrors or KeyErrors.
|
||||||
|
If types is given and the result doesn't match one of those types, default is returned'''
|
||||||
|
try:
|
||||||
|
for key in keys:
|
||||||
|
object = object[key]
|
||||||
|
except (TypeError, IndexError, KeyError):
|
||||||
|
return default
|
||||||
|
else:
|
||||||
|
if not types or isinstance(object, types):
|
||||||
|
return object
|
||||||
|
else:
|
||||||
|
return default
|
||||||
|
|
||||||
|
def multi_deep_get(object, *key_sequences, default=None, types=()):
|
||||||
|
'''Like deep_get, but can try different key sequences in case one fails.
|
||||||
|
Return default if all of them fail. key_sequences is a list of lists'''
|
||||||
|
for key_sequence in key_sequences:
|
||||||
|
_object = object
|
||||||
|
try:
|
||||||
|
for key in key_sequence:
|
||||||
|
_object = _object[key]
|
||||||
|
except (TypeError, IndexError, KeyError):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
if not types or isinstance(_object, types):
|
||||||
|
return _object
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
return default
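An illustrative sketch of how these accessors behave on a nested structure; the sample dict below is made up for the example and is not real YouTube JSON:

sample = {'header': {'title': {'runs': [{'text': 'Example'}]}}, 'items': [10, 20]}

assert deep_get(sample, 'header', 'title', 'runs', 0, 'text') == 'Example'
assert deep_get(sample, 'header', 'missing', default='n/a') == 'n/a'
assert get(sample['items'], 5) is None            # out-of-range index -> default
assert multi_deep_get(
    sample,
    ['header', 'subtitle', 'runs', 0, 'text'],    # fails, falls through
    ['header', 'title', 'runs', 0, 'text'],       # succeeds
) == 'Example'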
|
||||||
|
|
||||||
|
def liberal_update(obj, key, value):
|
||||||
|
'''Updates obj[key] with value as long as value is not None.
|
||||||
|
Ensures obj[key] will at least get a value of None, however'''
|
||||||
|
if (value is not None) or (key not in obj):
|
||||||
|
obj[key] = value
|
||||||
|
|
||||||
|
def conservative_update(obj, key, value):
|
||||||
|
'''Only updates obj if it doesn't have key or obj[key] is None'''
|
||||||
|
if obj.get(key) is None:
|
||||||
|
obj[key] = value
|
||||||
|
|
||||||
|
def concat_or_none(*strings):
|
||||||
|
'''Concatenates strings. Returns None if any of the arguments are None'''
|
||||||
|
result = ''
|
||||||
|
for string in strings:
|
||||||
|
if string is None:
|
||||||
|
return None
|
||||||
|
result += string
|
||||||
|
return result
|
||||||
|
|
||||||
|
def remove_redirect(url):
|
||||||
|
if url is None:
|
||||||
|
return None
|
||||||
|
if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking
|
||||||
|
query_string = url[url.find('?')+1: ]
|
||||||
|
return urllib.parse.parse_qs(query_string)['q'][0]
|
||||||
|
return url
|
||||||
|
|
||||||
|
youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
|
||||||
|
def normalize_url(url):
|
||||||
|
if url is None:
|
||||||
|
return None
|
||||||
|
match = youtube_url_re.fullmatch(url)
|
||||||
|
if match is None:
|
||||||
|
raise Exception()
|
||||||
|
|
||||||
|
return 'https://www.youtube.com' + match.group(1)
|
||||||
|
|
||||||
|
def _recover_urls(runs):
|
||||||
|
for run in runs:
|
||||||
|
url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
|
||||||
|
text = run.get('text', '')
|
||||||
|
# second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text
|
||||||
|
if url is not None and (text.startswith('http://') or text.startswith('https://')):
|
||||||
|
url = remove_redirect(url)
|
||||||
|
run['url'] = url
|
||||||
|
run['text'] = url # youtube truncates the url text, use actual url instead
|
||||||
|
|
||||||
|
def extract_str(node, default=None, recover_urls=False):
|
||||||
|
'''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)'''
|
||||||
|
if isinstance(node, str):
|
||||||
|
return node
|
||||||
|
|
||||||
|
try:
|
||||||
|
return node['simpleText']
|
||||||
|
except (KeyError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
if isinstance(node, dict) and 'runs' in node:
|
||||||
|
if recover_urls:
|
||||||
|
_recover_urls(node['runs'])
|
||||||
|
return ''.join(text_run.get('text', '') for text_run in node['runs'])
|
||||||
|
|
||||||
|
return default
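Small examples of the three node shapes extract_str accepts (plain string, simpleText node, runs node); the sample nodes are made up:

assert extract_str('already a string') == 'already a string'
assert extract_str({'simpleText': 'Plain title'}) == 'Plain title'
assert extract_str({'runs': [{'text': 'Part 1 '}, {'text': 'Part 2'}]}) == 'Part 1 Part 2'
assert extract_str(None, default='') == ''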
|
||||||
|
|
||||||
|
def extract_formatted_text(node):
|
||||||
|
if not node:
|
||||||
|
return []
|
||||||
|
if 'runs' in node:
|
||||||
|
_recover_urls(node['runs'])
|
||||||
|
return node['runs']
|
||||||
|
elif 'simpleText' in node:
|
||||||
|
return [{'text': node['simpleText']}]
|
||||||
|
return []
|
||||||
|
|
||||||
|
def extract_int(string, default=None):
|
||||||
|
if isinstance(string, int):
|
||||||
|
return string
|
||||||
|
if not isinstance(string, str):
|
||||||
|
string = extract_str(string)
|
||||||
|
if not string:
|
||||||
|
return default
|
||||||
|
match = re.search(r'\b(\d+)\b', string.replace(',', ''))
|
||||||
|
if match is None:
|
||||||
|
return default
|
||||||
|
try:
|
||||||
|
return int(match.group(1))
|
||||||
|
except ValueError:
|
||||||
|
return default
|
||||||
|
|
||||||
|
def extract_approx_int(string):
|
||||||
|
'''e.g. "15.1M" from "15.1M subscribers"'''
|
||||||
|
if not isinstance(string, str):
|
||||||
|
string = extract_str(string)
|
||||||
|
if not string:
|
||||||
|
return None
|
||||||
|
match = re.search(r'\b(\d+(?:\.\d+)?[KMBTkmbt]?)\b', string.replace(',', ''))
|
||||||
|
if match is None:
|
||||||
|
return None
|
||||||
|
return match.group(1)
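Hand-worked examples of the two numeric extractors; the inputs are made up, modelled on typical YouTube count strings:

assert extract_int('1,234,567 views') == 1234567
assert extract_int({'simpleText': '42 likes'}) == 42
assert extract_int('no digits here', default=0) == 0
assert extract_approx_int('15.1M subscribers') == '15.1M'
assert extract_approx_int('No views') is None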
|
||||||
|
|
||||||
|
MONTH_ABBREVIATIONS = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'}
|
||||||
|
def extract_date(date_text):
|
||||||
|
'''Input: "Mar 9, 2019". Output: "2019-3-9"'''
|
||||||
|
if not isinstance(date_text, str):
|
||||||
|
date_text = extract_str(date_text)
|
||||||
|
if date_text is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
date_text = date_text.replace(',', '').lower()
|
||||||
|
parts = date_text.split()
|
||||||
|
if len(parts) >= 3:
|
||||||
|
month, day, year = parts[-3:]
|
||||||
|
month = MONTH_ABBREVIATIONS.get(month[0:3]) # slicing in case they start writing out the full month name
|
||||||
|
if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None):
|
||||||
|
return year + '-' + month + '-' + day
|
||||||
|
return None
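Examples of extract_date against strings in the format the docstring describes; the second input is made up:

assert extract_date('Mar 9, 2019') == '2019-3-9'
assert extract_date({'simpleText': 'Streamed live on Dec 25, 2020'}) == '2020-12-25'
assert extract_date('yesterday') is None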
|
||||||
|
|
||||||
|
def check_missing_keys(object, *key_sequences):
|
||||||
|
for key_sequence in key_sequences:
|
||||||
|
_object = object
|
||||||
|
try:
|
||||||
|
for key in key_sequence:
|
||||||
|
_object = _object[key]
|
||||||
|
except (KeyError, IndexError, TypeError):
|
||||||
|
return 'Could not find ' + key
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def extract_item_info(item, additional_info={}):
|
||||||
|
if not item:
|
||||||
|
return {'error': 'No item given'}
|
||||||
|
|
||||||
|
type = get(list(item.keys()), 0)
|
||||||
|
if not type:
|
||||||
|
return {'error': 'Could not find type'}
|
||||||
|
item = item[type]
|
||||||
|
|
||||||
|
info = {'error': None}
|
||||||
|
if type in ('itemSectionRenderer', 'compactAutoplayRenderer'):
|
||||||
|
return extract_item_info(deep_get(item, 'contents', 0), additional_info)
|
||||||
|
|
||||||
|
if type in ('movieRenderer', 'clarificationRenderer'):
|
||||||
|
info['type'] = 'unsupported'
|
||||||
|
return info
|
||||||
|
|
||||||
|
info.update(additional_info)
|
||||||
|
|
||||||
|
# type looks like e.g. 'compactVideoRenderer' or 'gridVideoRenderer'
|
||||||
|
# camelCase split, https://stackoverflow.com/a/37697078
|
||||||
|
type_parts = [s.lower() for s in re.sub(r'([A-Z][a-z]+)', r' \1', type).split()]
|
||||||
|
if len(type_parts) < 2:
|
||||||
|
info['type'] = 'unsupported'
|
||||||
|
return info  # return the dict so callers can rely on a consistent return type
|
||||||
|
primary_type = type_parts[-2]
|
||||||
|
if primary_type == 'video':
|
||||||
|
info['type'] = 'video'
|
||||||
|
elif primary_type in ('playlist', 'radio', 'show'):
|
||||||
|
info['type'] = 'playlist'
|
||||||
|
elif primary_type == 'channel':
|
||||||
|
info['type'] = 'channel'
|
||||||
|
elif type == 'videoWithContextRenderer': # stupid exception
|
||||||
|
info['type'] = 'video'
|
||||||
|
primary_type = 'video'
|
||||||
|
else:
|
||||||
|
info['type'] = 'unsupported'
|
||||||
|
|
||||||
|
# videoWithContextRenderer changes it to 'headline' just to be annoying
|
||||||
|
info['title'] = extract_str(multi_get(item, 'title', 'headline'))
|
||||||
|
if primary_type != 'channel':
|
||||||
|
info['author'] = extract_str(multi_get(item, 'longBylineText', 'shortBylineText', 'ownerText'))
|
||||||
|
info['author_id'] = extract_str(multi_deep_get(item,
|
||||||
|
['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
||||||
|
['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
||||||
|
['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId']
|
||||||
|
))
|
||||||
|
info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
|
||||||
|
info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText'))
|
||||||
|
info['thumbnail'] = multi_deep_get(item,
|
||||||
|
['thumbnail', 'thumbnails', 0, 'url'], # videos
|
||||||
|
['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists
|
||||||
|
['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
|
||||||
|
)
|
||||||
|
|
||||||
|
info['badges'] = []
|
||||||
|
for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()):
|
||||||
|
badge = deep_get(badge_node, 'metadataBadgeRenderer', 'label')
|
||||||
|
if badge:
|
||||||
|
info['badges'].append(badge)
|
||||||
|
|
||||||
|
if primary_type in ('video', 'playlist'):
|
||||||
|
info['time_published'] = None
|
||||||
|
timestamp = re.search(r'(\d+ \w+ ago)',
|
||||||
|
extract_str(item.get('publishedTimeText'), default=''))
|
||||||
|
if timestamp:
|
||||||
|
info['time_published'] = timestamp.group(1)
|
||||||
|
|
||||||
|
if primary_type == 'video':
|
||||||
|
info['id'] = item.get('videoId')
|
||||||
|
info['view_count'] = extract_int(item.get('viewCountText'))
|
||||||
|
|
||||||
|
# dig into accessibility data to get view_count for videos marked as recommended, and to get time_published
|
||||||
|
accessibility_label = multi_deep_get(item,
|
||||||
|
['title', 'accessibility', 'accessibilityData', 'label'],
|
||||||
|
['headline', 'accessibility', 'accessibilityData', 'label'],
|
||||||
|
default='')
|
||||||
|
timestamp = re.search(r'(\d+ \w+ ago)', accessibility_label)
|
||||||
|
if timestamp:
|
||||||
|
conservative_update(info, 'time_published', timestamp.group(1))
|
||||||
|
view_count = re.search(r'(\d+) views', accessibility_label.replace(',', ''))
|
||||||
|
if view_count:
|
||||||
|
conservative_update(info, 'view_count', int(view_count.group(1)))
|
||||||
|
|
||||||
|
if info['view_count']:
|
||||||
|
info['approx_view_count'] = '{:,}'.format(info['view_count'])
|
||||||
|
else:
|
||||||
|
info['approx_view_count'] = extract_approx_int(item.get('shortViewCountText'))
|
||||||
|
|
||||||
|
# handle case where it is "No views"
|
||||||
|
if not info['approx_view_count']:
|
||||||
|
if ('No views' in item.get('shortViewCountText', '')
|
||||||
|
or 'no views' in accessibility_label.lower()):
|
||||||
|
info['view_count'] = 0
|
||||||
|
info['approx_view_count'] = '0'
|
||||||
|
|
||||||
|
info['duration'] = extract_str(item.get('lengthText'))
|
||||||
|
|
||||||
|
# if it's an item in a playlist, get its index
|
||||||
|
if 'index' in item: # url has wrong index on playlist page
|
||||||
|
info['index'] = extract_int(item.get('index'))
|
||||||
|
elif 'indexText' in item:
|
||||||
|
# Current item in playlist has ▶ instead of the actual index, must
|
||||||
|
# dig into url
|
||||||
|
match = re.search(r'index=(\d+)', deep_get(item,
|
||||||
|
'navigationEndpoint', 'commandMetadata', 'webCommandMetadata',
|
||||||
|
'url', default=''))
|
||||||
|
if match is None: # worth a try then
|
||||||
|
info['index'] = extract_int(item.get('indexText'))
|
||||||
|
else:
|
||||||
|
info['index'] = int(match.group(1))
|
||||||
|
else:
|
||||||
|
info['index'] = None
|
||||||
|
|
||||||
|
elif primary_type in ('playlist', 'radio'):
|
||||||
|
info['id'] = item.get('playlistId')
|
||||||
|
info['video_count'] = extract_int(item.get('videoCount'))
|
||||||
|
elif primary_type == 'channel':
|
||||||
|
info['id'] = item.get('channelId')
|
||||||
|
info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText'))
|
||||||
|
elif primary_type == 'show':
|
||||||
|
info['id'] = deep_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId')
|
||||||
|
|
||||||
|
if primary_type in ('playlist', 'channel'):
|
||||||
|
conservative_update(info, 'video_count', extract_int(item.get('videoCountText')))
|
||||||
|
|
||||||
|
for overlay in item.get('thumbnailOverlays', []):
|
||||||
|
conservative_update(info, 'duration', extract_str(deep_get(
|
||||||
|
overlay, 'thumbnailOverlayTimeStatusRenderer', 'text'
|
||||||
|
)))
|
||||||
|
# show renderers don't have videoCountText
|
||||||
|
conservative_update(info, 'video_count', extract_int(deep_get(
|
||||||
|
overlay, 'thumbnailOverlayBottomPanelRenderer', 'text'
|
||||||
|
)))
|
||||||
|
return info
|
||||||
|
|
||||||
|
def extract_response(polymer_json):
|
||||||
|
'''return response, error'''
|
||||||
|
response = multi_deep_get(polymer_json, [1, 'response'], ['response'])
|
||||||
|
if response is None:
|
||||||
|
return None, 'Failed to extract response'
|
||||||
|
else:
|
||||||
|
return response, None
|
||||||
|
|
||||||
|
|
||||||
|
_item_types = {
|
||||||
|
'movieRenderer',
|
||||||
|
'didYouMeanRenderer',
|
||||||
|
'showingResultsForRenderer',
|
||||||
|
|
||||||
|
'videoRenderer',
|
||||||
|
'compactVideoRenderer',
|
||||||
|
'compactAutoplayRenderer',
|
||||||
|
'videoWithContextRenderer',
|
||||||
|
'gridVideoRenderer',
|
||||||
|
'playlistVideoRenderer',
|
||||||
|
|
||||||
|
'playlistRenderer',
|
||||||
|
'compactPlaylistRenderer',
|
||||||
|
'gridPlaylistRenderer',
|
||||||
|
|
||||||
|
'radioRenderer',
|
||||||
|
'compactRadioRenderer',
|
||||||
|
'gridRadioRenderer',
|
||||||
|
|
||||||
|
'showRenderer',
|
||||||
|
'compactShowRenderer',
|
||||||
|
'gridShowRenderer',
|
||||||
|
|
||||||
|
|
||||||
|
'channelRenderer',
|
||||||
|
'compactChannelRenderer',
|
||||||
|
'gridChannelRenderer',
|
||||||
|
}
|
||||||
|
|
||||||
|
def _traverse_browse_renderer(renderer):
|
||||||
|
for tab in get(renderer, 'tabs', ()):
|
||||||
|
tab_renderer = multi_get(tab, 'tabRenderer', 'expandableTabRenderer')
|
||||||
|
if tab_renderer is None:
|
||||||
|
continue
|
||||||
|
if tab_renderer.get('selected', False):
|
||||||
|
return get(tab_renderer, 'content', {})
|
||||||
|
print('Could not find tab with content')
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def _traverse_standard_list(renderer):
|
||||||
|
renderer_list = multi_get(renderer, 'contents', 'items', default=())
|
||||||
|
continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation')
|
||||||
|
return renderer_list, continuation
|
||||||
|
|
||||||
|
# these renderers contain one inside them
|
||||||
|
nested_renderer_dispatch = {
|
||||||
|
'singleColumnBrowseResultsRenderer': _traverse_browse_renderer,
|
||||||
|
'twoColumnBrowseResultsRenderer': _traverse_browse_renderer,
|
||||||
|
'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}),
|
||||||
|
}
|
||||||
|
|
||||||
|
# these renderers contain a list of renderers inside them
|
||||||
|
nested_renderer_list_dispatch = {
|
||||||
|
'sectionListRenderer': _traverse_standard_list,
|
||||||
|
'itemSectionRenderer': _traverse_standard_list,
|
||||||
|
'gridRenderer': _traverse_standard_list,
|
||||||
|
'playlistVideoListRenderer': _traverse_standard_list,
|
||||||
|
'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[]), None),
|
||||||
|
}
|
||||||
|
def get_nested_renderer_list_function(key):
|
||||||
|
if key in nested_renderer_list_dispatch:
|
||||||
|
return nested_renderer_list_dispatch[key]
|
||||||
|
elif key.endswith('Continuation'):
|
||||||
|
return _traverse_standard_list
|
||||||
|
return None
|
||||||
|
|
||||||
|
def extract_items_from_renderer(renderer, item_types=_item_types):
|
||||||
|
ctoken = None
|
||||||
|
items = []
|
||||||
|
|
||||||
|
iter_stack = collections.deque()
|
||||||
|
current_iter = iter(())
|
||||||
|
|
||||||
|
while True:
|
||||||
|
# mode 1: get a new renderer by iterating.
|
||||||
|
# goes down the stack for an iterator if one has been exhausted
|
||||||
|
if not renderer:
|
||||||
|
try:
|
||||||
|
renderer = current_iter.__next__()
|
||||||
|
except StopIteration:
|
||||||
|
try:
|
||||||
|
current_iter = iter_stack.pop()
|
||||||
|
except IndexError:
|
||||||
|
return items, ctoken
|
||||||
|
# Get new renderer or check that the one we got is good before
|
||||||
|
# proceeding to mode 2
|
||||||
|
continue
|
||||||
|
|
||||||
|
|
||||||
|
# mode 2: dig into the current renderer
|
||||||
|
key, value = list(renderer.items())[0]
|
||||||
|
|
||||||
|
# the renderer is an item
|
||||||
|
if key in item_types:
|
||||||
|
items.append(renderer)
|
||||||
|
|
||||||
|
# has a list in it, add it to the iter stack
|
||||||
|
elif get_nested_renderer_list_function(key):
|
||||||
|
renderer_list, cont = get_nested_renderer_list_function(key)(value)
|
||||||
|
if renderer_list:
|
||||||
|
iter_stack.append(current_iter)
|
||||||
|
current_iter = iter(renderer_list)
|
||||||
|
if cont:
|
||||||
|
ctoken = cont
|
||||||
|
|
||||||
|
# new renderer nested inside this one
|
||||||
|
elif key in nested_renderer_dispatch:
|
||||||
|
renderer = nested_renderer_dispatch[key](value)
|
||||||
|
continue # don't reset renderer to None
|
||||||
|
|
||||||
|
renderer = None
|
||||||
|
|
||||||
|
def extract_items(response, item_types=_item_types):
|
||||||
|
'''return items, ctoken'''
|
||||||
|
if 'continuationContents' in response:
|
||||||
|
# sometimes there's another, empty, junk [something]Continuation key
|
||||||
|
# find real one
|
||||||
|
for key, renderer_cont in get(response,
|
||||||
|
'continuationContents', {}).items():
|
||||||
|
# e.g. commentSectionContinuation, playlistVideoListContinuation
|
||||||
|
if key.endswith('Continuation'):
|
||||||
|
items, cont = extract_items_from_renderer({key: renderer_cont},
|
||||||
|
item_types=item_types)
|
||||||
|
if items:
|
||||||
|
return items, cont
|
||||||
|
return [], None
|
||||||
|
elif 'contents' in response:
|
||||||
|
renderer = get(response, 'contents', {})
|
||||||
|
return extract_items_from_renderer(renderer, item_types=item_types)
|
||||||
|
else:
|
||||||
|
return [], None
|
281 youtube/yt_data_extract/everything_else.py (new file)
@@ -0,0 +1,281 @@
|
|||||||
|
from .common import (get, multi_get, deep_get, multi_deep_get,
|
||||||
|
liberal_update, conservative_update, remove_redirect, normalize_url,
|
||||||
|
extract_str, extract_formatted_text, extract_int, extract_approx_int,
|
||||||
|
extract_date, check_missing_keys, extract_item_info, extract_items,
|
||||||
|
extract_response)
|
||||||
|
from youtube import proto
|
||||||
|
|
||||||
|
import re
|
||||||
|
import urllib
|
||||||
|
from math import ceil
|
||||||
|
|
||||||
|
def extract_channel_info(polymer_json, tab):
|
||||||
|
response, err = extract_response(polymer_json)
|
||||||
|
if err:
|
||||||
|
return {'error': err}
|
||||||
|
|
||||||
|
|
||||||
|
metadata = deep_get(response, 'metadata', 'channelMetadataRenderer',
|
||||||
|
default={})
|
||||||
|
if not metadata:
|
||||||
|
metadata = deep_get(response, 'microformat', 'microformatDataRenderer',
|
||||||
|
default={})
|
||||||
|
|
||||||
|
# channel doesn't exist or was terminated
|
||||||
|
# example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org
|
||||||
|
if not metadata:
|
||||||
|
if response.get('alerts'):
|
||||||
|
error_string = ' '.join(
|
||||||
|
extract_str(deep_get(alert, 'alertRenderer', 'text'), default='')
|
||||||
|
for alert in response['alerts']
|
||||||
|
)
|
||||||
|
if not error_string:
|
||||||
|
error_string = 'Failed to extract error'
|
||||||
|
return {'error': error_string}
|
||||||
|
elif deep_get(response, 'responseContext', 'errors'):
|
||||||
|
for error in response['responseContext']['errors'].get('error', []):
|
||||||
|
if error.get('code') == 'INVALID_VALUE' and error.get('location') == 'browse_id':
|
||||||
|
return {'error': 'This channel does not exist'}
|
||||||
|
return {'error': 'Failure getting metadata'}
|
||||||
|
|
||||||
|
info = {'error': None}
|
||||||
|
info['current_tab'] = tab
|
||||||
|
|
||||||
|
info['approx_subscriber_count'] = extract_approx_int(deep_get(response,
|
||||||
|
'header', 'c4TabbedHeaderRenderer', 'subscriberCountText'))
|
||||||
|
|
||||||
|
# stuff from microformat (info given by youtube for every page on channel)
|
||||||
|
info['short_description'] = metadata.get('description')
|
||||||
|
if info['short_description'] and len(info['short_description']) > 730:
|
||||||
|
info['short_description'] = info['short_description'][0:730] + '...'
|
||||||
|
info['channel_name'] = metadata.get('title')
|
||||||
|
info['avatar'] = multi_deep_get(metadata,
|
||||||
|
['avatar', 'thumbnails', 0, 'url'],
|
||||||
|
['thumbnail', 'thumbnails', 0, 'url'],
|
||||||
|
)
|
||||||
|
channel_url = multi_get(metadata, 'urlCanonical', 'channelUrl')
|
||||||
|
if channel_url:
|
||||||
|
channel_id = get(channel_url.rstrip('/').split('/'), -1)
|
||||||
|
info['channel_id'] = channel_id
|
||||||
|
else:
|
||||||
|
info['channel_id'] = metadata.get('externalId')
|
||||||
|
if info['channel_id']:
|
||||||
|
info['channel_url'] = 'https://www.youtube.com/channel/' + info['channel_id']
|
||||||
|
else:
|
||||||
|
info['channel_url'] = None
|
||||||
|
|
||||||
|
# get items
|
||||||
|
info['items'] = []
|
||||||
|
|
||||||
|
# empty channel
|
||||||
|
if 'contents' not in response and 'continuationContents' not in response:
|
||||||
|
return info
|
||||||
|
|
||||||
|
if tab in ('videos', 'playlists', 'search'):
|
||||||
|
items, ctoken = extract_items(response)
|
||||||
|
additional_info = {'author': info['channel_name'], 'author_url': info['channel_url']}
|
||||||
|
info['items'] = [extract_item_info(renderer, additional_info) for renderer in items]
|
||||||
|
if tab == 'search':
|
||||||
|
info['is_last_page'] = (ctoken is None)
|
||||||
|
elif tab == 'about':
|
||||||
|
items, _ = extract_items(response, item_types={'channelAboutFullMetadataRenderer'})
|
||||||
|
if not items:
|
||||||
|
info['error'] = 'Could not find channelAboutFullMetadataRenderer'
|
||||||
|
return info
|
||||||
|
channel_metadata = items[0]['channelAboutFullMetadataRenderer']
|
||||||
|
|
||||||
|
info['links'] = []
|
||||||
|
for link_json in channel_metadata.get('primaryLinks', ()):
|
||||||
|
url = remove_redirect(deep_get(link_json, 'navigationEndpoint', 'urlEndpoint', 'url'))
|
||||||
|
text = extract_str(link_json.get('title'))
|
||||||
|
info['links'].append( (text, url) )
|
||||||
|
|
||||||
|
info['date_joined'] = extract_date(channel_metadata.get('joinedDateText'))
|
||||||
|
info['view_count'] = extract_int(channel_metadata.get('viewCountText'))
|
||||||
|
info['description'] = extract_str(channel_metadata.get('description'), default='')
|
||||||
|
else:
|
||||||
|
raise NotImplementedError('Unknown or unsupported channel tab: ' + tab)
|
||||||
|
|
||||||
|
return info
|
||||||
|
|
||||||
|
def extract_search_info(polymer_json):
|
||||||
|
response, err = extract_response(polymer_json)
|
||||||
|
if err:
|
||||||
|
return {'error': err}
|
||||||
|
info = {'error': None}
|
||||||
|
info['estimated_results'] = int(response['estimatedResults'])
|
||||||
|
info['estimated_pages'] = ceil(info['estimated_results']/20)
|
||||||
|
|
||||||
|
|
||||||
|
results, _ = extract_items(response)
|
||||||
|
|
||||||
|
|
||||||
|
info['items'] = []
|
||||||
|
info['corrections'] = {'type': None}
|
||||||
|
for renderer in results:
|
||||||
|
type = list(renderer.keys())[0]
|
||||||
|
if type == 'shelfRenderer':
|
||||||
|
continue
|
||||||
|
if type == 'didYouMeanRenderer':
|
||||||
|
renderer = renderer[type]
|
||||||
|
|
||||||
|
info['corrections'] = {
|
||||||
|
'type': 'did_you_mean',
|
||||||
|
'corrected_query': renderer['correctedQueryEndpoint']['searchEndpoint']['query'],
|
||||||
|
'corrected_query_text': renderer['correctedQuery']['runs'],
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
if type == 'showingResultsForRenderer':
|
||||||
|
renderer = renderer[type]
|
||||||
|
|
||||||
|
info['corrections'] = {
|
||||||
|
'type': 'showing_results_for',
|
||||||
|
'corrected_query_text': renderer['correctedQuery']['runs'],
|
||||||
|
'original_query_text': renderer['originalQuery']['simpleText'],
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
|
||||||
|
i_info = extract_item_info(renderer)
|
||||||
|
if i_info.get('type') != 'unsupported':
|
||||||
|
info['items'].append(i_info)
|
||||||
|
|
||||||
|
|
||||||
|
return info
|
||||||
|
|
||||||
|
def extract_playlist_metadata(polymer_json):
|
||||||
|
response, err = extract_response(polymer_json)
|
||||||
|
if err:
|
||||||
|
return {'error': err}
|
||||||
|
|
||||||
|
metadata = {'error': None}
|
||||||
|
header = deep_get(response, 'header', 'playlistHeaderRenderer', default={})
|
||||||
|
metadata['title'] = extract_str(header.get('title'))
|
||||||
|
|
||||||
|
metadata['first_video_id'] = deep_get(header, 'playEndpoint', 'watchEndpoint', 'videoId')
|
||||||
|
first_id = re.search(r'([a-zA-Z0-9_\-]{11})', deep_get(header,
|
||||||
|
'thumbnail', 'thumbnails', 0, 'url', default=''))
|
||||||
|
if first_id:
|
||||||
|
conservative_update(metadata, 'first_video_id', first_id.group(1))
|
||||||
|
if metadata['first_video_id'] is None:
|
||||||
|
metadata['thumbnail'] = None
|
||||||
|
else:
|
||||||
|
metadata['thumbnail'] = 'https://i.ytimg.com/vi/' + metadata['first_video_id'] + '/mqdefault.jpg'
|
||||||
|
|
||||||
|
metadata['video_count'] = extract_int(header.get('numVideosText'))
|
||||||
|
metadata['description'] = extract_str(header.get('descriptionText'), default='')
|
||||||
|
metadata['author'] = extract_str(header.get('ownerText'))
|
||||||
|
metadata['author_id'] = multi_deep_get(header,
|
||||||
|
['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
||||||
|
['ownerEndpoint', 'browseEndpoint', 'browseId'])
|
||||||
|
if metadata['author_id']:
|
||||||
|
metadata['author_url'] = 'https://www.youtube.com/channel/' + metadata['author_id']
|
||||||
|
else:
|
||||||
|
metadata['author_url'] = None
|
||||||
|
metadata['view_count'] = extract_int(header.get('viewCountText'))
|
||||||
|
metadata['like_count'] = extract_int(header.get('likesCountWithoutLikeText'))
|
||||||
|
for stat in header.get('stats', ()):
|
||||||
|
text = extract_str(stat)
|
||||||
|
if 'videos' in text:
|
||||||
|
conservative_update(metadata, 'video_count', extract_int(text))
|
||||||
|
elif 'views' in text:
|
||||||
|
conservative_update(metadata, 'view_count', extract_int(text))
|
||||||
|
elif 'updated' in text:
|
||||||
|
metadata['time_published'] = extract_date(text)
|
||||||
|
|
||||||
|
return metadata
|
||||||
|
|
||||||
|
def extract_playlist_info(polymer_json):
|
||||||
|
response, err = extract_response(polymer_json)
|
||||||
|
if err:
|
||||||
|
return {'error': err}
|
||||||
|
info = {'error': None}
|
||||||
|
first_page = 'continuationContents' not in response
|
||||||
|
video_list, _ = extract_items(response)
|
||||||
|
|
||||||
|
info['items'] = [extract_item_info(renderer) for renderer in video_list]
|
||||||
|
|
||||||
|
if first_page:
|
||||||
|
info['metadata'] = extract_playlist_metadata(polymer_json)
|
||||||
|
|
||||||
|
return info
|
||||||
|
|
||||||
|
def _ctoken_metadata(ctoken):
|
||||||
|
result = dict()
|
||||||
|
params = proto.parse(proto.b64_to_bytes(ctoken))
|
||||||
|
result['video_id'] = proto.parse(params[2])[2].decode('ascii')
|
||||||
|
|
||||||
|
offset_information = proto.parse(params[6])
|
||||||
|
result['offset'] = offset_information.get(5, 0)
|
||||||
|
|
||||||
|
result['is_replies'] = False
|
||||||
|
if (3 in offset_information) and (2 in proto.parse(offset_information[3])):
|
||||||
|
result['is_replies'] = True
|
||||||
|
result['sort'] = None
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
result['sort'] = proto.parse(offset_information[4])[6]
|
||||||
|
except KeyError:
|
||||||
|
result['sort'] = 0
|
||||||
|
return result
|
||||||
|
|
||||||
|
def extract_comments_info(polymer_json):
|
||||||
|
response, err = extract_response(polymer_json)
|
||||||
|
if err:
|
||||||
|
return {'error': err}
|
||||||
|
info = {'error': None}
|
||||||
|
|
||||||
|
url = multi_deep_get(polymer_json, [1, 'url'], ['url'])
|
||||||
|
if url:
|
||||||
|
ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
|
||||||
|
metadata = _ctoken_metadata(ctoken)
|
||||||
|
else:
|
||||||
|
metadata = {}
|
||||||
|
info['video_id'] = metadata.get('video_id')
|
||||||
|
info['offset'] = metadata.get('offset')
|
||||||
|
info['is_replies'] = metadata.get('is_replies')
|
||||||
|
info['sort'] = metadata.get('sort')
|
||||||
|
info['video_title'] = None
|
||||||
|
|
||||||
|
comments, ctoken = extract_items(response,
|
||||||
|
item_types={'commentThreadRenderer', 'commentRenderer'})
|
||||||
|
info['comments'] = []
|
||||||
|
info['ctoken'] = ctoken
|
||||||
|
for comment in comments:
|
||||||
|
comment_info = {}
|
||||||
|
|
||||||
|
if 'commentThreadRenderer' in comment: # top level comments
|
||||||
|
conservative_update(info, 'is_replies', False)
|
||||||
|
comment_thread = comment['commentThreadRenderer']
|
||||||
|
info['video_title'] = extract_str(comment_thread.get('commentTargetTitle'))
|
||||||
|
if 'replies' not in comment_thread:
|
||||||
|
comment_info['reply_count'] = 0
|
||||||
|
else:
|
||||||
|
comment_info['reply_count'] = extract_int(deep_get(comment_thread,
|
||||||
|
'replies', 'commentRepliesRenderer', 'moreText'
|
||||||
|
), default=1) # With 1 reply, the text reads "View reply"
|
||||||
|
comment_renderer = deep_get(comment_thread, 'comment', 'commentRenderer', default={})
|
||||||
|
elif 'commentRenderer' in comment: # replies
|
||||||
|
comment_info['reply_count'] = 0 # replyCount, below, not present for replies even if the reply has further replies to it
|
||||||
|
conservative_update(info, 'is_replies', True)
|
||||||
|
comment_renderer = comment['commentRenderer']
|
||||||
|
else:
|
||||||
|
comment_renderer = {}
|
||||||
|
|
||||||
|
# These 3 are sometimes absent, likely because the channel was deleted
|
||||||
|
comment_info['author'] = extract_str(comment_renderer.get('authorText'))
|
||||||
|
comment_info['author_url'] = deep_get(comment_renderer,
|
||||||
|
'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
|
||||||
|
comment_info['author_id'] = deep_get(comment_renderer,
|
||||||
|
'authorEndpoint', 'browseEndpoint', 'browseId')
|
||||||
|
|
||||||
|
comment_info['author_avatar'] = deep_get(comment_renderer,
|
||||||
|
'authorThumbnail', 'thumbnails', 0, 'url')
|
||||||
|
comment_info['id'] = comment_renderer.get('commentId')
|
||||||
|
comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText'))
|
||||||
|
comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText'))
|
||||||
|
comment_info['like_count'] = comment_renderer.get('likeCount')
|
||||||
|
liberal_update(comment_info, 'reply_count', comment_renderer.get('replyCount'))
|
||||||
|
|
||||||
|
info['comments'].append(comment_info)
|
||||||
|
|
||||||
|
return info
|
689 youtube/yt_data_extract/watch_extraction.py (new file)
@@ -0,0 +1,689 @@
|
|||||||
|
from .common import (get, multi_get, deep_get, multi_deep_get,
|
||||||
|
liberal_update, conservative_update, remove_redirect, normalize_url,
|
||||||
|
extract_str, extract_formatted_text, extract_int, extract_approx_int,
|
||||||
|
extract_date, check_missing_keys, extract_item_info, extract_items,
|
||||||
|
extract_response, concat_or_none)
|
||||||
|
|
||||||
|
import json
|
||||||
|
import urllib.parse
|
||||||
|
import traceback
|
||||||
|
import re
|
||||||
|
|
||||||
|
# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py
|
||||||
|
_formats = {
|
||||||
|
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
|
||||||
|
'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
|
||||||
|
    '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
    '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'mp4v'},
    '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 96, 'vcodec': 'h264'},
    '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
    '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), audio_bitrate varies as well
    '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
    '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
    '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
    '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
    '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
    '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
    '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
    '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},

    # 3D videos
    '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
    '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
    '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
    '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
    '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},

    # Apple HTTP Live Streaming
    '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
    '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
    '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
    '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
    '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
    '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'h264'},

    # DASH mp4 video
    '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'},  # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
    '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
    '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
    '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

    # Dash mp4 audio
    '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 48, 'container': 'm4a_dash'},
    '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 128, 'container': 'm4a_dash'},
    '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 256, 'container': 'm4a_dash'},
    '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
    '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
    '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
    '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

    # Dash webm
    '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
    '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
    '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
    '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
    '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
    '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

    # Dash webm audio
    '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 128},
    '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 256},

    # Dash webm audio with opus inside
    '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 50},
    '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 70},
    '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 160},

    # RTMP (unnamed)
    '_rtmp': {'protocol': 'rtmp'},

    # av01 video only formats sometimes served with "unknown" codecs
    '394': {'vcodec': 'av01.0.05M.08'},
    '395': {'vcodec': 'av01.0.05M.08'},
    '396': {'vcodec': 'av01.0.05M.08'},
    '397': {'vcodec': 'av01.0.05M.08'},
}

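# Editor's sketch (not part of the original module): a minimal helper showing how the
# hardcoded itag table above is typically consulted. _extract_formats() further down does
# the same merge via conservative_update(); the itag value 22 here is just an example.
def _describe_itag(itag):
    """Return the hardcoded container/codec info for an itag, or an empty dict."""
    return _formats.get(str(itag), {})

# _describe_itag(22) would return roughly
# {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}
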
def _extract_metadata_row_info(video_renderer_info):
    # extract category and music list
    info = {
        'category': None,
        'music_list': [],
    }

    current_song = {}
    for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
        row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='')
        row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0))
        if row_title == 'Category':
            info['category'] = row_content
        elif row_title in ('Song', 'Music'):
            if current_song:
                info['music_list'].append(current_song)
            current_song = {'title': row_content}
        elif row_title == 'Artist':
            current_song['artist'] = row_content
        elif row_title == 'Album':
            current_song['album'] = row_content
        elif row_title == 'Writers':
            current_song['writers'] = row_content
        elif row_title.startswith('Licensed'):
            current_song['licensor'] = row_content
    if current_song:
        info['music_list'].append(current_song)

    return info

def _extract_watch_info_mobile(top_level):
    info = {}
    microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})

    family_safe = microformat.get('isFamilySafe')
    if family_safe is None:
        info['age_restricted'] = None
    else:
        info['age_restricted'] = not family_safe
    info['allowed_countries'] = microformat.get('availableCountries', [])
    info['time_published'] = microformat.get('publishDate')

    response = top_level.get('response', {})

    # this renderer has the stuff visible on the page
    # check for playlist
    items, _ = extract_items(response,
        item_types={'singleColumnWatchNextResults'})
    if items:
        watch_next_results = items[0]['singleColumnWatchNextResults']
        playlist = deep_get(watch_next_results, 'playlist', 'playlist')
        if playlist is None:
            info['playlist'] = None
        else:
            info['playlist'] = {}
            info['playlist']['title'] = playlist.get('title')
            info['playlist']['author'] = extract_str(multi_get(playlist,
                'ownerName', 'longBylineText', 'shortBylineText', 'ownerText'))
            author_id = deep_get(playlist, 'longBylineText', 'runs', 0,
                'navigationEndpoint', 'browseEndpoint', 'browseId')
            info['playlist']['author_id'] = author_id
            if author_id:
                info['playlist']['author_url'] = concat_or_none(
                    'https://www.youtube.com/channel/', author_id)
            info['playlist']['id'] = playlist.get('playlistId')
            info['playlist']['url'] = concat_or_none(
                'https://www.youtube.com/playlist?list=',
                info['playlist']['id'])
            info['playlist']['video_count'] = playlist.get('totalVideos')
            info['playlist']['current_index'] = playlist.get('currentIndex')
            info['playlist']['items'] = [
                extract_item_info(i) for i in playlist.get('contents', ())]
    else:
        info['playlist'] = None

    # Holds the visible video info. It is inside singleColumnWatchNextResults
    # but use our convenience function instead
    items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'})
    if items:
        video_info = items[0]['slimVideoMetadataRenderer']
    else:
        print('Failed to extract video metadata')
        video_info = {}

    info.update(_extract_metadata_row_info(video_info))
    info['description'] = extract_str(video_info.get('description'), recover_urls=True)
    info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle')))
    info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
    info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
    info['title'] = extract_str(video_info.get('title'))
    info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='')
    info['unlisted'] = False
    for badge in video_info.get('badges', []):
        if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
            info['unlisted'] = True
    info['like_count'] = None
    info['dislike_count'] = None
    if not info['time_published']:
        info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
    for button in video_info.get('buttons', ()):
        button_renderer = button.get('slimMetadataToggleButtonRenderer', {})

        # all the digits can be found in the accessibility data
        count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))

        # this count doesn't have all the digits, it's like 53K for instance
        dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))

        # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
        if dumb_count == 0:
            count = 0

        if 'isLike' in button_renderer:
            info['like_count'] = count
        elif 'isDislike' in button_renderer:
            info['dislike_count'] = count

    # comment section info
    items, _ = extract_items(response, item_types={
        'commentSectionRenderer', 'commentsEntryPointHeaderRenderer'})
    if items:
        header_type = list(items[0])[0]
        comment_info = items[0][header_type]
        # This seems to be some kind of A/B test being done on mobile, where
        # this is present instead of the normal commentSectionRenderer. It can
        # be seen here:
        # https://www.androidpolice.com/2019/10/31/google-youtube-app-comment-section-below-videos/
        # https://www.youtube.com/watch?v=bR5Q-wD-6qo
        if header_type == 'commentsEntryPointHeaderRenderer':
            comment_count_text = extract_str(comment_info.get('headerText'))
        else:
            comment_count_text = extract_str(deep_get(comment_info,
                'header', 'commentSectionHeaderRenderer', 'countText'))
        if comment_count_text == 'Comments':  # just this with no number, means 0 comments
            info['comment_count'] = 0
        else:
            info['comment_count'] = extract_int(comment_count_text)
        info['comments_disabled'] = False
    else:  # no comment section present means comments are disabled
        info['comment_count'] = 0
        info['comments_disabled'] = True

    # check for limited state
    items, _ = extract_items(response, item_types={'limitedStateMessageRenderer'})
    if items:
        info['limited_state'] = True
    else:
        info['limited_state'] = False

    # related videos
    related, _ = extract_items(response)
    info['related_videos'] = [extract_item_info(renderer) for renderer in related]

    return info

def _extract_watch_info_desktop(top_level):
    info = {
        'comment_count': None,
        'comments_disabled': None,
        'allowed_countries': [],
        'limited_state': None,
        'playlist': None,
    }

    video_info = {}
    for renderer in deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()):
        if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'):
            video_info.update(list(renderer.values())[0])

    info.update(_extract_metadata_row_info(video_info))
    info['description'] = extract_str(video_info.get('description', None), recover_urls=True)
    info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))

    likes_dislikes = deep_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
    if len(likes_dislikes) == 2:
        info['like_count'] = extract_int(likes_dislikes[0])
        info['dislike_count'] = extract_int(likes_dislikes[1])
    else:
        info['like_count'] = None
        info['dislike_count'] = None

    info['title'] = extract_str(video_info.get('title', None))
    info['author'] = extract_str(deep_get(video_info, 'owner', 'videoOwnerRenderer', 'title'))
    info['author_id'] = deep_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
    info['view_count'] = extract_int(extract_str(deep_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))

    related = deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
    info['related_videos'] = [extract_item_info(renderer) for renderer in related]

    return info

def update_format_with_codec_info(fmt, codec):
    if (codec.startswith('av')
            or codec in ('vp9', 'vp8', 'vp8.0', 'h263', 'h264', 'mp4v')):
        if codec == 'vp8.0':
            codec = 'vp8'
        conservative_update(fmt, 'vcodec', codec)
    elif (codec.startswith('mp4a')
            or codec in ('opus', 'mp3', 'aac', 'dtse', 'ec-3', 'vorbis')):
        conservative_update(fmt, 'acodec', codec)
    else:
        print('Warning: unrecognized codec: ' + codec)

fmt_type_re = re.compile(
    r'(text|audio|video)/([\w0-9]+); codecs="([\w0-9\.]+(?:, [\w0-9\.]+)*)"')
def update_format_with_type_info(fmt, yt_fmt):
    # 'type' for invidious api format
    mime_type = multi_get(yt_fmt, 'mimeType', 'type')
    if mime_type is None:
        return
    match = re.fullmatch(fmt_type_re, mime_type)

    type, fmt['ext'], codecs = match.groups()
    codecs = codecs.split(', ')
    for codec in codecs:
        update_format_with_codec_info(fmt, codec)
    if type == 'audio':
        assert len(codecs) == 1

def _extract_formats(info, player_response):
    streaming_data = player_response.get('streamingData', {})
    yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])

    info['formats'] = []
    # because we may retry the extract_formats with a different player_response
    # so keep what we have
    conservative_update(info, 'hls_manifest_url',
        streaming_data.get('hlsManifestUrl'))
    conservative_update(info, 'dash_manifest_url',
        streaming_data.get('dash_manifest_url'))

    for yt_fmt in yt_formats:
        itag = yt_fmt.get('itag')

        fmt = {}
        fmt['itag'] = itag
        fmt['ext'] = None
        fmt['audio_bitrate'] = None
        fmt['acodec'] = None
        fmt['vcodec'] = None
        fmt['width'] = yt_fmt.get('width')
        fmt['height'] = yt_fmt.get('height')
        fmt['file_size'] = yt_fmt.get('contentLength')
        fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate')
        fmt['fps'] = yt_fmt.get('fps')
        update_format_with_type_info(fmt, yt_fmt)
        cipher = dict(urllib.parse.parse_qsl(multi_get(yt_fmt,
            'cipher', 'signatureCipher', default='')))
        if cipher:
            fmt['url'] = cipher.get('url')
        else:
            fmt['url'] = yt_fmt.get('url')
        fmt['s'] = cipher.get('s')
        fmt['sp'] = cipher.get('sp')

        # update with information from big table
        hardcoded_itag_info = _formats.get(str(itag), {})
        for key, value in hardcoded_itag_info.items():
            conservative_update(fmt, key, value)  # prefer info from Youtube
        fmt['quality'] = hardcoded_itag_info.get('height')

        info['formats'].append(fmt)

    # get ip address
    if info['formats']:
        query_string = (info['formats'][0].get('url') or '?').split('?')[1]
        info['ip_address'] = deep_get(
            urllib.parse.parse_qs(query_string), 'ip', 0)
    else:
        info['ip_address'] = None

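# Editor's sketch (not part of the original module): after _extract_formats() has run,
# every entry in info['formats'] is a plain dict, so callers can filter it however they
# like. A minimal example that keeps only progressive (audio + video) streams:
def _progressive_formats(info):
    """Return the formats that carry both an audio and a video codec."""
    return [f for f in info.get('formats', [])
            if f.get('acodec') and f.get('vcodec')]
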
hls_regex = re.compile(r'[\w_-]+=(?:"[^"]+"|[^",]+),')
def extract_hls_formats(hls_manifest):
    '''returns hls_formats, err'''
    hls_formats = []
    try:
        lines = hls_manifest.splitlines()
        i = 0
        while i < len(lines):
            if lines[i].startswith('#EXT-X-STREAM-INF'):
                fmt = {'acodec': None, 'vcodec': None, 'height': None,
                    'width': None, 'fps': None, 'audio_bitrate': None,
                    'itag': None, 'file_size': None,
                    'audio_sample_rate': None, 'url': None}
                properties = lines[i].split(':')[1]
                properties += ','  # make regex work for last key-value pair

                for pair in hls_regex.findall(properties):
                    key, value = pair.rstrip(',').split('=')
                    if key == 'CODECS':
                        for codec in value.strip('"').split(','):
                            update_format_with_codec_info(fmt, codec)
                    elif key == 'RESOLUTION':
                        fmt['width'], fmt['height'] = map(int, value.split('x'))
                        fmt['resolution'] = value
                    elif key == 'FRAME-RATE':
                        fmt['fps'] = int(value)
                i += 1
                fmt['url'] = lines[i]
                assert fmt['url'].startswith('http')
                fmt['ext'] = 'm3u8'
                hls_formats.append(fmt)
            i += 1
    except Exception as e:
        traceback.print_exc()
        return [], str(e)
    return hls_formats, None

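# Editor's sketch (not part of the original module): extract_hls_formats() takes the raw
# text of a master .m3u8 playlist. The two-line manifest below is a made-up minimal example.
_example_manifest = (
    '#EXT-X-STREAM-INF:BANDWIDTH=1200000,CODECS="avc1.4d401f,mp4a.40.2",RESOLUTION=1280x720,FRAME-RATE=30\n'
    'https://example.com/stream_720p.m3u8\n'
)
# extract_hls_formats(_example_manifest) would return roughly
# ([{'vcodec': 'avc1.4d401f', 'acodec': 'mp4a.40.2', 'width': 1280, 'height': 720,
#    'fps': 30, 'ext': 'm3u8', 'url': 'https://example.com/stream_720p.m3u8', ...}], None)
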
def _extract_playability_error(info, player_response, error_prefix=''):
    if info['formats']:
        info['playability_status'] = None
        info['playability_error'] = None
        return

    playability_status = deep_get(player_response, 'playabilityStatus', 'status', default=None)
    info['playability_status'] = playability_status

    playability_reason = extract_str(multi_deep_get(player_response,
        ['playabilityStatus', 'reason'],
        ['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'],
        default='Could not find playability error')
    )

    if playability_status not in (None, 'OK'):
        info['playability_error'] = error_prefix + playability_reason
    elif not info['playability_error']:  # do not override
        info['playability_error'] = error_prefix + 'Unknown playability error'

SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
def extract_watch_info(polymer_json):
    info = {'playability_error': None, 'error': None}

    if isinstance(polymer_json, dict):
        top_level = polymer_json
    elif isinstance(polymer_json, (list, tuple)):
        top_level = {}
        for page_part in polymer_json:
            if not isinstance(page_part, dict):
                return {'error': 'Invalid page part'}
            top_level.update(page_part)
    else:
        return {'error': 'Invalid top level polymer data'}

    error = check_missing_keys(top_level,
        ['player', 'args'],
        ['player', 'assets', 'js'],
        ['playerResponse'],
    )
    if error:
        info['playability_error'] = error

    player_response = top_level.get('playerResponse', {})

    # usually, only the embedded one has the urls
    player_args = deep_get(top_level, 'player', 'args', default={})
    if 'player_response' in player_args:
        embedded_player_response = json.loads(player_args['player_response'])
    else:
        embedded_player_response = {}

    # captions
    info['automatic_caption_languages'] = []
    info['manual_caption_languages'] = []
    info['_manual_caption_language_names'] = {}  # language name written in that language, needed in some cases to create the url
    info['translation_languages'] = []
    captions_info = player_response.get('captions', {})
    info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
    for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
        lang_code = caption_track.get('languageCode')
        if not lang_code:
            continue
        if caption_track.get('kind') == 'asr':
            info['automatic_caption_languages'].append(lang_code)
        else:
            info['manual_caption_languages'].append(lang_code)
        base_url = caption_track.get('baseUrl', '')
        lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0)
        if lang_name:
            info['_manual_caption_language_names'][lang_code] = lang_name

    for translation_lang_info in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()):
        lang_code = translation_lang_info.get('languageCode')
        if lang_code:
            info['translation_languages'].append(lang_code)
        if translation_lang_info.get('isTranslatable') == False:
            print('WARNING: Found non-translatable caption language')

    # formats
    _extract_formats(info, embedded_player_response)
    if not info['formats']:
        _extract_formats(info, player_response)

    # playability errors
    _extract_playability_error(info, player_response)

    # check age-restriction
    info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error'])

    # base_js (for decryption of signatures)
    info['base_js'] = deep_get(top_level, 'player', 'assets', 'js')
    if info['base_js']:
        info['base_js'] = normalize_url(info['base_js'])
        # must uniquely identify url
        info['player_name'] = urllib.parse.urlparse(info['base_js']).path
    else:
        info['player_name'] = None

    # extract stuff from visible parts of page
    mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={})
    if mobile:
        info.update(_extract_watch_info_mobile(top_level))
    else:
        info.update(_extract_watch_info_desktop(top_level))

    # stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info
    vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={})
    liberal_update(info, 'title', extract_str(vd.get('title')))
    liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds')))
    liberal_update(info, 'view_count', extract_int(vd.get('viewCount')))
    # videos with no description have a blank string
    liberal_update(info, 'description', vd.get('shortDescription'))
    liberal_update(info, 'id', vd.get('videoId'))
    liberal_update(info, 'author', vd.get('author'))
    liberal_update(info, 'author_id', vd.get('channelId'))
    info['was_live'] = vd.get('isLiveContent')
    conservative_update(info, 'unlisted', not vd.get('isCrawlable', True))  # isCrawlable is false on limited state videos even if they aren't unlisted
    liberal_update(info, 'tags', vd.get('keywords', []))

    # fallback stuff from microformat
    mf = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
    conservative_update(info, 'title', extract_str(mf.get('title')))
    conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds')))
    # this gives the view count for limited state videos
    conservative_update(info, 'view_count', extract_int(mf.get('viewCount')))
    conservative_update(info, 'description', extract_str(mf.get('description'), recover_urls=True))
    conservative_update(info, 'author', mf.get('ownerChannelName'))
    conservative_update(info, 'author_id', mf.get('externalChannelId'))
    conservative_update(info, 'live', deep_get(mf, 'liveBroadcastDetails',
        'isLiveNow'))
    liberal_update(info, 'unlisted', mf.get('isUnlisted'))
    liberal_update(info, 'category', mf.get('category'))
    liberal_update(info, 'time_published', mf.get('publishDate'))
    liberal_update(info, 'time_uploaded', mf.get('uploadDate'))

    # other stuff
    info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
    return info

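# Editor's sketch (not part of the original module): typical use of extract_watch_info().
# Fetching the watch page's polymer/pbj JSON is up to the caller (the request details are
# outside this module); polymer_json below is assumed to already hold the parsed response.
def _summarize_watch_info(polymer_json):
    """Minimal example of consuming extract_watch_info()'s result."""
    info = extract_watch_info(polymer_json)
    if info.get('error') or info.get('playability_error'):
        return None
    playable = [f for f in info['formats'] if f.get('url')]
    return {
        'title': info['title'],
        'author': info['author'],
        'formats_available': len(playable),
    }
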
def get_caption_url(info, language, format, automatic=False, translation_language=None):
    '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.'''
    url = info['_captions_base_url']
    url += '&lang=' + language
    url += '&fmt=' + format
    if automatic:
        url += '&kind=asr'
    elif language in info['_manual_caption_language_names']:
        url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='')

    if translation_language:
        url += '&tlang=' + translation_language
    return url

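# Editor's sketch (not part of the original module): building a caption URL once
# extract_watch_info() has filled in the caption fields. 'en' and 'vtt' are example values;
# SUBTITLE_FORMATS above lists the formats this code expects YouTube to serve.
def _english_vtt_captions(info):
    """Return a .vtt caption URL for English, preferring manual over automatic captions."""
    if 'en' in info['manual_caption_languages']:
        return get_caption_url(info, 'en', 'vtt', automatic=False)
    if 'en' in info['automatic_caption_languages']:
        return get_caption_url(info, 'en', 'vtt', automatic=True)
    return None
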
def update_with_age_restricted_info(info, video_info_page):
    ERROR_PREFIX = 'Error bypassing age-restriction: '

    video_info = urllib.parse.parse_qs(video_info_page)
    player_response = deep_get(video_info, 'player_response', 0)
    if player_response is None:
        info['playability_error'] = ERROR_PREFIX + 'Could not find player_response in video_info_page'
        return
    try:
        player_response = json.loads(player_response)
    except json.decoder.JSONDecodeError:
        traceback.print_exc()
        info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response'
        return

    _extract_formats(info, player_response)
    _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)

def requires_decryption(info):
    return ('formats' in info) and info['formats'] and info['formats'][0]['s']

# adapted from youtube-dl and invidious:
# https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr
decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}{]+)return a\.join\(""\)\}')
op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)')
def extract_decryption_function(info, base_js):
    '''Insert decryption function into info. Return error string if not successful.
    Decryption function is a list of list[2] of numbers.
    It is advisable to cache the decryption function (uniquely identified by info['player_name']) so base.js (1 MB) doesn't need to be redownloaded each time'''
    info['decryption_function'] = None
    decrypt_function_match = decrypt_function_re.search(base_js)
    if decrypt_function_match is None:
        return 'Could not find decryption function in base.js'

    function_body = decrypt_function_match.group(1).split(';')[1:-1]
    if not function_body:
        return 'Empty decryption function body'

    var_name = get(function_body[0].split('.'), 0)
    if var_name is None:
        return 'Could not find var_name'

    var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, flags=re.DOTALL)
    if var_body_match is None:
        return 'Could not find var_body'

    operations = var_body_match.group(1).replace('\n', '').split('},')
    if not operations:
        return 'Did not find any definitions in var_body'
    operations[-1] = operations[-1][:-1]  # remove the trailing '}' since we split by '},' on the others
    operation_definitions = {}
    for op in operations:
        colon_index = op.find(':')
        opening_brace_index = op.find('{')

        if colon_index == -1 or opening_brace_index == -1:
            return 'Could not parse operation'
        op_name = op[:colon_index]
        op_body = op[opening_brace_index+1:]
        if op_body == 'a.reverse()':
            operation_definitions[op_name] = 0
        elif op_body == 'a.splice(0,b)':
            operation_definitions[op_name] = 1
        elif op_body.startswith('var c=a[0]'):
            operation_definitions[op_name] = 2
        else:
            return 'Unknown op_body: ' + op_body

    decryption_function = []
    for op_with_arg in function_body:
        match = op_with_arg_re.fullmatch(op_with_arg)
        if match is None:
            return 'Could not parse operation with arg'
        op_name = match.group(1)
        if op_name not in operation_definitions:
            return 'Unknown op_name: ' + op_name
        op_argument = match.group(2)
        decryption_function.append([operation_definitions[op_name], int(op_argument)])

    info['decryption_function'] = decryption_function
    return False

def _operation_2(a, b):
    c = a[0]
    a[0] = a[b % len(a)]
    a[b % len(a)] = c

def decrypt_signatures(info):
    '''Applies info['decryption_function'] to decrypt all the signatures. Return err.'''
    if not info.get('decryption_function'):
        return 'decryption_function not in info'
    for format in info['formats']:
        if not format['s'] or not format['sp'] or not format['url']:
            print('Warning: s, sp, or url not in format')
            continue

        a = list(format['s'])
        for op, argument in info['decryption_function']:
            if op == 0:
                a.reverse()
            elif op == 1:
                a = a[argument:]
            else:
                _operation_2(a, argument)

        signature = ''.join(a)
        format['url'] += '&' + format['sp'] + '=' + signature
    return False

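# Editor's sketch (not part of the original module): how the decryption helpers above fit
# together. Fetching base.js and caching the extracted function (keyed by
# info['player_name']) is left to the caller; base_js_text is assumed to already hold the
# downloaded player javascript.
def _prepare_format_urls(info, base_js_text):
    """Decrypt signed format URLs in place. Returns an error string, or False on success."""
    if not requires_decryption(info):
        return False  # urls are already usable
    err = extract_decryption_function(info, base_js_text)
    if err:
        return err
    return decrypt_signatures(info)
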
@@ -1,4 +1,4 @@
-from youtube_data import proto, utils
+from youtube_data import proto
 from flask import Markup as mk
 import requests
 import base64
@@ -1,130 +0,0 @@
from youtube_data import proto
import json
import base64
import urllib
import requests
import re
import bleach
from flask import Markup

URL_ORIGIN = "/https://www.youtube.com"


def make_comment_ctoken(video_id, sort=0, offset=0, lc='', secret_key=''):
    video_id = proto.as_bytes(video_id)
    secret_key = proto.as_bytes(secret_key)

    page_info = proto.string(4, video_id) + proto.uint(6, sort)
    offset_information = proto.nested(4, page_info) + proto.uint(5, offset)
    if secret_key:
        offset_information = proto.string(1, secret_key) + offset_information

    page_params = proto.string(2, video_id)
    if lc:
        page_params += proto.string(6, proto.percent_b64encode(proto.string(15, lc)))

    result = proto.nested(2, page_params) + proto.uint(3, 6) + proto.nested(6, offset_information)
    return base64.urlsafe_b64encode(result).decode('ascii')


def comment_replies_ctoken(video_id, comment_id, max_results=500):
    params = proto.string(2, comment_id) + proto.uint(9, max_results)
    params = proto.nested(3, params)

    result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3, 6) + proto.nested(6, params)
    return base64.urlsafe_b64encode(result).decode('ascii')


mobile_headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'X-YouTube-Client-Name': '2',
    'X-YouTube-Client-Version': '2.20180823',
}
def request_comments(ctoken, replies=False):
    if replies:  # let's make it use different urls for no reason despite all the data being encoded
        base_url = "https://m.youtube.com/watch_comment?action_get_comment_replies=1&ctoken="
    else:
        base_url = "https://m.youtube.com/watch_comment?action_get_comments=1&ctoken="
    url = base_url + ctoken.replace("=", "%3D") + "&pbj=1"

    for i in range(0, 8):  # don't retry more than 8 times
        content = requests.get(url, headers=mobile_headers).text
        # content is text, so compare against str prefixes (not bytes)
        if content[0:4] == ")]}'":  # random closing characters included at beginning of response for some reason
            content = content[4:]
        elif content[0:10] == '\n<!DOCTYPE':  # occasionally returns html instead of json for no reason
            content = ''
            print("got <!DOCTYPE>, retrying")
            continue
        break

    polymer_json = json.loads(content)
    return polymer_json


def single_comment_ctoken(video_id, comment_id):
    page_params = proto.string(2, video_id) + proto.string(6, proto.percent_b64encode(proto.string(15, comment_id)))

    result = proto.nested(2, page_params) + proto.uint(3, 6)
    return base64.urlsafe_b64encode(result).decode('ascii')


def concat_texts(strings):
    '''Concatenates strings. Returns None if any of the arguments are None'''
    result = ''
    for string in strings:
        if string['text'] is None:
            return None
        result += string['text']
    return result


def parse_comment(raw_comment):
    cmnt = {}
    raw_comment = raw_comment['commentThreadRenderer']['comment']['commentRenderer']
    imgHostName = urllib.parse.urlparse(raw_comment['authorThumbnail']['thumbnails'][0]['url']).netloc
    cmnt['author'] = raw_comment['authorText']['runs'][0]['text']
    cmnt['thumbnail'] = raw_comment['authorThumbnail']['thumbnails'][0]['url'].replace("https://{}".format(imgHostName), "")+"?host="+imgHostName
    cmnt['channel'] = raw_comment['authorEndpoint']['commandMetadata']['webCommandMetadata']['url']
    cmnt['text'] = Markup(bleach.linkify(concat_texts(raw_comment['contentText']['runs']).replace("\n", "<br>")))
    cmnt['date'] = raw_comment['publishedTimeText']['runs'][0]['text']

    try:
        cmnt['creatorHeart'] = raw_comment['creatorHeart']['creatorHeartRenderer']['creatorThumbnail']['thumbnails'][0]['url']
    except:
        cmnt['creatorHeart'] = False

    try:
        cmnt['likes'] = raw_comment['likeCount']
    except:
        cmnt['likes'] = 0

    try:
        cmnt['replies'] = raw_comment['replyCount']
    except:
        cmnt['replies'] = 0

    cmnt['authorIsChannelOwner'] = raw_comment['authorIsChannelOwner']
    try:
        cmnt['pinned'] = raw_comment['pinnedCommentBadge']
        cmnt['pinned'] = True
    except:
        cmnt['pinned'] = False
    return cmnt


def post_process_comments_info(comments_info):
    comments = []
    for comment in comments_info[1]['response']['continuationContents']['commentSectionContinuation']['items']:
        comments.append(parse_comment(comment))
    return comments


def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
    comments_info = request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key))
    comments_info = post_process_comments_info(comments_info)
    return comments_info

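# Editor's sketch (not part of the original, now-deleted module): the intended call chain
# of the helpers above. The video id is a placeholder; a real ctoken only makes sense for
# an existing video, and request_comments() performs a network request.
#
#     ctoken = make_comment_ctoken('dQw4w9WgXcQ', sort=0, offset=0)
#     polymer_json = request_comments(ctoken)
#     comments = post_process_comments_info(polymer_json)   # same as video_comments(...)
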
@@ -1,5 +1,5 @@
-from youtube_data import proto, utils
-from bs4 import BeautifulSoup as bs
+from youtube_data import proto
+from youtube import utils
 from flask import Markup
 import urllib.parse
 import requests
@@ -1,12 +0,0 @@
def get_description_snippet_text(ds):
    string = ""
    for t in ds:
        try:
            if t['bold']:
                text = "<b>"+t['text']+"</b>"
            else:
                text = t['text']
        except:
            text = t['text']
        string = string + text
    return string
@@ -1,281 +0,0 @@
from bs4 import BeautifulSoup as bs
from urllib.parse import unquote
from youtube_dl import YoutubeDL
import urllib.parse
import requests
import json

# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py
_formats = {
    # (itag table identical to the _formats dict reproduced earlier in this diff; omitted here to avoid repeating it)
}

def get_renderer_key(renderer, key):
    for k in renderer:
        if key in k:
            return k[key]


def get_video_primary_info(datad, datai):
    contents = datai["contents"]["twoColumnWatchNextResults"]['results']['results']['contents']
    item = get_renderer_key(contents, "videoPrimaryInfoRenderer")
    details = datad['videoDetails']

    # Check if is Livestream
    if details.get('isLive') and details['lengthSeconds'] == '0':
        isLive = True
    else:
        isLive = False

    # Check if is a Scheduled video
    if details.get('isUpcoming') == True:
        isUpcoming = True
        views = "Scheduled video"
        premieres = item['dateText']['simpleText']
        audioURL = False
    else:
        isUpcoming = False
        premieres = False
        views = details['viewCount']

    ydl = YoutubeDL()

    if isUpcoming == False:
        data = ydl.extract_info(details['videoId'], False)
        while not data['formats']:
            data = ydl.extract_info(details['videoId'], False)
        formats = data['formats']

        ## Get audio
        audio_urls = []
        for f in data['formats']:
            for fid in _formats:
                if f['format_id'] == fid:
                    try:
                        if 'audio' in _formats[fid]['format_note']:
                            aurl = f['url']
                            fnote = _formats[fid]['format_note']
                            bitrate = _formats[fid]['audio_bitrate']
                            audio_inf = {
                                "url": aurl,
                                "id": fnote,
                                "btr": bitrate
                            }
                            audio_urls.append(audio_inf)
                    except:
                        continue
        if not isLive:
            audioURL = audio_urls[-1]['url']
        else:
            audioURL = False
    else:  # If it is a scheduled video
        audio_urls = False
        formats = False
    try:
        primaryInfo = {
            "id": details['videoId'],
            "title": details['title'],
            "description": details['shortDescription'],
            "views": views,
            "duration": details['lengthSeconds'],
            "date": item['dateText']['simpleText'],
            "rating": details['averageRating'],
            "author": details['author'],
            "isPrivate": details['isPrivate'],
            "isLive": isLive,
            "isUpcoming": isUpcoming,
            "url": url,  # NOTE: 'url' is not defined in this scope; the resulting NameError falls through to the except branch below
            "allowRatings": details['allowRatings'],
            "urls": formats,
            "thumbnail": details['thumbnail']['thumbnails'][0]['url'],
            "audio": audioURL,
            "premieres": premieres
        }
    except:
        # If error take only most common items
        primaryInfo = {
            "id": details['videoId'],
            "title": details['title'],
            "description": details['shortDescription'],
            "views": details['viewCount'],
            "duration": details['lengthSeconds'],
            "date": item['dateText']['simpleText'],
            "rating": details['averageRating'],
            "author": details['author'],
            "isPrivate": False,
            "isLive": isLive,
            "isUpcoming": isUpcoming,
            "allowRatings": True,
            "urls": formats,
            "thumbnail": details['thumbnail']['thumbnails'][0]['url'],
            "audio": audioURL,
            "premieres": premieres
        }
    return primaryInfo


def get_video_owner_info(data):
    contents = data["contents"]["twoColumnWatchNextResults"]['results']['results']['contents']
    item = get_renderer_key(contents, "videoSecondaryInfoRenderer")
    ownerItem = item['owner']['videoOwnerRenderer']

    try:
        sC = ownerItem['subscriberCountText']['runs'][0]['text']
    except:
        sC = "Unknown"
    ownerInfo = {
        "thumbnail": ownerItem['thumbnail']['thumbnails'][0]['url'],
        "username": ownerItem['title']['runs'][0]['text'],
        "id": ownerItem['title']['runs'][0]['navigationEndpoint']['browseEndpoint']['browseId'],
        "suscriberCount": sC
    }
    return ownerInfo


def get_video_info(id):
    headers = {"Accept-Language": "en-US,en;q=0.5"}
    encoded_search = urllib.parse.quote(id)
    BASE_URL = "https://youtube.com"

    url = f"{BASE_URL}/watch?v={encoded_search}"
    response = requests.get(url, headers=headers).text

    while 'window["ytInitialData"]' not in response:
        response = requests.get(url, headers=headers).text

    start = (
        response.index('window["ytInitialData"]')
        + len('window["ytInitialData"]')
        + 3
    )

    start2 = (
        response.index('window["ytInitialPlayerResponse"]')
        + len('window["ytInitialPlayerResponse"]') + 3
    )

    end1 = response.index("};", start) + 1
    end2 = response.index("};", start2) + 1
    jsonIni = response[start:end1]
    dataInitial = json.loads(jsonIni)

    try:
        jsonDet = response[start2:end2]
        dataDetails = json.loads(jsonDet)
    except:
        response = requests.get(url, headers=headers).json()
        jsonDet = response[start2:end2]
        dataDetails = json.loads(jsonDet)

    # title, views, date
    videoInfo = get_video_primary_info(dataDetails, dataInitial)
    ownerInfo = get_video_owner_info(dataInitial)

    '''soup = bs(response, "html.parser")
    soup = str(str(soup.find("div", attrs={"id":"player-wrap"}).find_all("script")).split("ytplayer.config =")[1]).split("url")
    for url in soup:
        if "googlevideo" in url:
            print(unquote(url.replace("\\", "")))'''
    info = {"video": videoInfo, "owner": ownerInfo}
    return info
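# Editor's sketch (not part of the original, now-deleted module): how a caller might have
# used this helper before it was removed. The video id is a placeholder and the call
# performs network requests.
#
#     data = get_video_info('dQw4w9WgXcQ')
#     title = data['video']['title']
#     channel = data['owner']['username']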