Add data extractor module

This commit is contained in:
pluja 2020-09-10 02:24:32 +02:00
parent f6e70fdbe3
commit a7c1eff8e2
2 changed files with 175 additions and 0 deletions

59
youtube_data/utils.py Normal file
View File

@ -0,0 +1,59 @@
import requests
import urllib.parse
import json
from bs4 import BeautifulSoup as bs
nested_renderer_dispatch = {
'singleColumnBrowseResultsRenderer',
'twoColumnBrowseResultsRenderer', # Channel renderer
'twoColumnSearchResultsRenderer',
}
# these renderers contain a list of renderers inside them
nested_renderer_list_dispatch = {
'sectionListRenderer',
'itemSectionRenderer',
'gridRenderer',
'playlistVideoListRenderer',
'singleColumnWatchNextResults',
}
_item_types = {
'movieRenderer',
'didYouMeanRenderer',
'showingResultsForRenderer',
'videoRenderer',
'compactVideoRenderer',
'compactAutoplayRenderer',
'videoWithContextRenderer',
'gridVideoRenderer',
'playlistVideoRenderer',
'playlistRenderer',
'compactPlaylistRenderer',
'gridPlaylistRenderer',
'radioRenderer',
'compactRadioRenderer',
'gridRadioRenderer',
'showRenderer',
'compactShowRenderer',
'gridShowRenderer',
'channelRenderer',
'compactChannelRenderer',
'gridChannelRenderer',
}
def getRenderers(data):
renderers = []
for renderer in nested_renderer_dispatch:
renderers.append(data['contents'][renderer])
return renderers
def getRenderedItems(renderer):
'''Given a renderer, return its items'''

116
youtube_data/videos.py Normal file
View File

@ -0,0 +1,116 @@
from bs4 import BeautifulSoup as bs
from urllib.parse import unquote
from youtube_dl import YoutubeDL
import urllib.parse
import requests
import json
def get_renderer_key(renderer, key):
for k in renderer:
if key in k:
return k[key]
def get_video_primary_info(datad, datai):
contents = datai["contents"]["twoColumnWatchNextResults"]['results']['results']['contents']
item = get_renderer_key(contents, "videoPrimaryInfoRenderer")
details = datad['videoDetails']
try:
isUpcoming = details['isUpcoming']
except:
isUpcoming = False
ydl = YoutubeDL()
data = ydl.extract_info(details['videoId'], False)
if not details['isLiveContent']:
url = data['formats'][-1]['url']
try:
primaryInfo = {
"id": details['videoId'],
"title": details['title'],
"description": details['shortDescription'],
"views": details['viewCount'],
"duration": details['lengthSeconds'],
"date": item['dateText']['simpleText'],
"rating": details['averageRating'],
"author": details['author'],
"isPrivate": details['isPrivate'],
"isLive": details['isLiveContent'],
"isUpcoming": isUpcoming,
"allowRatings": details['allowRatings'],
"url":url,
"thumbnail": details['thumbnail']['thumbnails'][0]['url']
}
except:
# If error take only most common items
primaryInfo = {
"id": details['videoId'],
"title": details['title'],
"description": details['shortDescription'],
"views": details['viewCount'],
"duration": details['lengthSeconds'],
"date": item['dateText']['simpleText'],
"rating": details['averageRating'],
"author": details['author'],
"isPrivate":False,
"isLive":False,
"isUpcoming":False,
"allowRatings":True,
"thumbnail": details['thumbnail']['thumbnails'][0]['url']
}
return primaryInfo
def get_video_owner_info(data):
contents = data["contents"]["twoColumnWatchNextResults"]['results']['results']['contents']
item = get_renderer_key(contents, "videoSecondaryInfoRenderer")
ownerItem = item['owner']['videoOwnerRenderer']
ownerInfo = {
"thumbnail": ownerItem['thumbnail']['thumbnails'][0]['url'],
"username": ownerItem['title']['runs'][0]['text'],
"id": "#",
"suscriberCount":ownerItem['subscriberCountText']['runs'][0]['text']
}
return ownerInfo
def get_video_info(id):
headers = {"Accept-Language": "en-US,en;q=0.5"}
encoded_search = urllib.parse.quote(id)
BASE_URL = "https://youtube.com"
url = f"{BASE_URL}/watch?v={encoded_search}"
response = requests.get(url, headers=headers).text
while 'window["ytInitialData"]' and 'window["ytInitialData"]' not in response:
response = requests.get(url, headers=headers).text
start = (
response.index('window["ytInitialData"]')
+ len('window["ytInitialData"]')
+ 3
)
start2 = (
response.index('window["ytInitialPlayerResponse"]')
+ len('window["ytInitialPlayerResponse"]') + 3
)
end1 = response.index("};", start) + 1
end2 = response.index("};", start2) + 1
jsonIni = response[start:end1]
dataInitial = json.loads(jsonIni)
jsonDet = response[start2:end2]
dataDetails = json.loads(jsonDet)
#title, views, date
videoInfo = get_video_primary_info(dataDetails, dataInitial)
ownerInfo = get_video_owner_info(dataInitial)
'''soup = bs(response, "html.parser")
soup = str(str(soup.find("div", attrs={"id":"player-wrap"}).find_all("script")).split("ytplayer.config =")[1]).split("url")
for url in soup:
if "googlevideo" in url:
print(unquote(url.replace("\\", "")))'''
info = {"video":videoInfo, "owner":ownerInfo}
return info