From a3f4d8978101f34da9eb75fe68404b2f981fdbca Mon Sep 17 00:00:00 2001 From: pluja Date: Fri, 11 Sep 2020 10:34:24 +0200 Subject: [PATCH] Improved Youtube search --- app/routes.py | 37 +++---- app/templates/_video_item.html | 4 +- app/templates/ytsearch.html | 51 ++++++---- youtube_data/proto.py | 130 +++++++++++++++++++++++++ youtube_data/search.py | 171 +++++++++++++++++++++++++++++++++ youtube_data/videos.py | 1 - 6 files changed, 345 insertions(+), 49 deletions(-) create mode 100644 youtube_data/proto.py create mode 100644 youtube_data/search.py diff --git a/app/routes.py b/app/routes.py index fe26ed4..0951eb2 100644 --- a/app/routes.py +++ b/app/routes.py @@ -7,7 +7,6 @@ from requests_futures.sessions import FuturesSession from werkzeug.datastructures import Headers from concurrent.futures import as_completed from werkzeug.utils import secure_filename -from youtube_data import videos as ytvids from youtube_search import YoutubeSearch from werkzeug.urls import url_parse from youtube_dl import YoutubeDL @@ -24,6 +23,10 @@ import bleach import urllib import json import re +######################################### +from youtube_data import videos as ytvids +from youtube_data import search as yts +######################################### ########################## #### Config variables #### ########################## @@ -225,32 +228,14 @@ def ytsearch(): channels = [] videos = [] - searchTerm = form.channelId.data - search = YoutubeSearch(searchTerm) - chnns = search.channels_to_dict() - vids = search.videos_to_dict() - - for v in vids: - videos.append({ - 'channelName':v['channel'], - 'videoTitle':v['title'], - 'description':Markup(v['long_desc']), - 'id':v['id'], - 'videoThumb': v['thumbnails'][-1], - 'channelUrl':v['url_suffix'], - 'channelId': v['channelId'], - 'views':v['views'], - 'timeStamp':v['publishedText'] - }) + searchTerms = form.channelId.data + page = 1 + autocorrect = 1 + sort = 0 + filters = {"time":0, "type":0, "duration":0} + results = yts.search_by_terms(searchTerms, page, autocorrect, sort, filters) - for c in chnns: - channels.append({ - 'username':c['name'], - 'channelId':c['id'], - 'thumbnail':'https:{}'.format(c['thumbnails'][0]), - 'subCount':c['suscriberCountText'].split(" ")[0] - }) - return render_template('ytsearch.html', form=form, btform=button_form, channels=channels, videos=videos, restricted=config['restrictPublicUsage'], config=config) + return render_template('ytsearch.html', form=form, btform=button_form, results=results, restricted=config['restrictPublicUsage'], config=config) else: return render_template('ytsearch.html', form=form) diff --git a/app/templates/_video_item.html b/app/templates/_video_item.html index 0c8b2cf..baa2a77 100644 --- a/app/templates/_video_item.html +++ b/app/templates/_video_item.html @@ -16,7 +16,7 @@
- {% if video.views == "Livestream" %} + {% if video.isLive == "Livestream" or video.isLive %} ‎‎{{video.views}} @@ -28,7 +28,7 @@ {% endif %} - {% if video.timeStamp == "Scheduled" %} + {% if video.timeStamp == "Scheduled" or video.isUpcoming %} {{video.timeStamp}}‎‎‎‎ ‎‎‎ diff --git a/app/templates/ytsearch.html b/app/templates/ytsearch.html index 9b20fd7..fee2066 100644 --- a/app/templates/ytsearch.html +++ b/app/templates/ytsearch.html @@ -14,37 +14,48 @@

{{ form.submit() }}

- {% if channels %} + {% if results.channels %}

Users

- {% for res in channels %} + {% for res in results.channels %}
- {% if restricted or current_user.is_authenticated %} -
- {% if not current_user.is_following_yt(res.channelId) %} -

+

+ +
+
+ {{res.username}} +
+ {{res.description}} +
+
+

+
+
+
+ {{res.suscribers}} +
+ +
+ {{res.videos}} +
+ + {% if restricted or current_user.is_authenticated %} +
+ {% if not current_user.is_following_yt(res.channelId) %}
{{ btform.hidden_tag() }} {{ btform.submit(value='Follow') }}
-

- {% else %} -

+ {% else %}

{{ btform.hidden_tag() }} {{ btform.submit(value='Unfollow') }}
-

+ {% endif %} +
{% endif %} -
- {% endif %} - Avatar -
- {{res.username}} -
- {{res.subCount}} -
+
{% endfor %} @@ -53,10 +64,10 @@
- {% if videos %} + {% if results.videos %}

Videos

- {% for video in videos %} + {% for video in results.videos %} {% include '_video_item.html' %} {% endfor %}
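For reference, the `results` object this template now receives from `yts.search_by_terms` (added in youtube_data/search.py further down) is a plain dict with two lists. A rough sketch of its shape, assuming the keys built by search.py; all values below are illustrative only:

# Sketch of `results` as built by youtube_data/search.py (every value here is made up).
results = {
    "channels": [{
        "channelId": "UC...",                       # hypothetical channel id
        "username": "Example Channel",
        "thumbnail": "https:~~yt3.ggpht.com~...",   # search.py swaps '/' for '~' in the URL
        "description": "markup-safe snippet",
        "suscribers": "1.2M",                        # key spelling matches {{res.suscribers}} above
        "videos": "345",
    }],
    "videos": [{
        "videoTitle": "Example video",
        "description": "markup-safe snippet",
        "views": "12,345 views",
        "timeStamp": "2 days ago",                   # "Scheduled" for upcoming premieres
        "duration": "10:01",
        "channelName": "Example Channel",
        "channelId": "UC...",
        "authorUrl": "/channel/UC...",
        "id": "abc123",                              # hypothetical video id
        "videoUrl": "/watch?v=abc123",
        "videoThumb": "https://i.ytimg.com/vi/abc123/default.jpg",
        "isLive": False,
        "isUpcoming": False,
    }],
}

The isLive/isUpcoming flags are what the _video_item.html change above keys on; when a channel result lacks some fields, search.py falls back to a smaller dict with an "avatar" key instead of "thumbnail".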
diff --git a/youtube_data/proto.py b/youtube_data/proto.py new file mode 100644 index 0000000..3c74083 --- /dev/null +++ b/youtube_data/proto.py @@ -0,0 +1,130 @@ +from math import ceil +import base64 +import io + +# FROM https://github.com/user234683/youtube-local/blob/master/youtube/proto.py + +def byte(n): + return bytes((n,)) + + +def varint_encode(offset): + '''In this encoding system, for each 8-bit byte, the first bit is 1 if there are more bytes, and 0 is this is the last one. + The next 7 bits are data. These 7-bit sections represent the data in Little endian order. For example, suppose the data is + aaaaaaabbbbbbbccccccc (each of these sections is 7 bits). It will be encoded as: + 1ccccccc 1bbbbbbb 0aaaaaaa + + This encoding is used in youtube parameters to encode offsets and to encode the length for length-prefixed data. + See https://developers.google.com/protocol-buffers/docs/encoding#varints for more info.''' + needed_bytes = ceil(offset.bit_length()/7) or 1 # (0).bit_length() returns 0, but we need 1 in that case. + encoded_bytes = bytearray(needed_bytes) + for i in range(0, needed_bytes - 1): + encoded_bytes[i] = (offset & 127) | 128 # 7 least significant bits + offset = offset >> 7 + encoded_bytes[-1] = offset & 127 # leave first bit as zero for last byte + + return bytes(encoded_bytes) + + +def varint_decode(encoded): + decoded = 0 + for i, byte in enumerate(encoded): + decoded |= (byte & 127) << 7*i + + if not (byte & 128): + break + return decoded + + +def string(field_number, data): + data = as_bytes(data) + return _proto_field(2, field_number, varint_encode(len(data)) + data) +nested = string + +def uint(field_number, value): + return _proto_field(0, field_number, varint_encode(value)) + + + + +def _proto_field(wire_type, field_number, data): + ''' See https://developers.google.com/protocol-buffers/docs/encoding#structure ''' + return varint_encode( (field_number << 3) | wire_type) + data + + + +def percent_b64encode(data): + return base64.urlsafe_b64encode(data).replace(b'=', b'%3D') + + +def unpadded_b64encode(data): + return base64.urlsafe_b64encode(data).replace(b'=', b'') + +def as_bytes(value): + if isinstance(value, str): + return value.encode('utf-8') + return value + + +def read_varint(data): + result = 0 + i = 0 + while True: + try: + byte = data.read(1)[0] + except IndexError: + if i == 0: + raise EOFError() + raise Exception('Unterminated varint starting at ' + str(data.tell() - i)) + result |= (byte & 127) << 7*i + if not byte & 128: + break + + i += 1 + return result + + +def read_group(data, end_sequence): + start = data.tell() + index = data.original.find(end_sequence, start) + if index == -1: + raise Exception('Unterminated group') + data.seek(index + len(end_sequence)) + return data.original[start:index] + +def read_protobuf(data): + data_original = data + data = io.BytesIO(data) + data.original = data_original + while True: + try: + tag = read_varint(data) + except EOFError: + break + wire_type = tag & 7 + field_number = tag >> 3 + + if wire_type == 0: + value = read_varint(data) + elif wire_type == 1: + value = data.read(8) + elif wire_type == 2: + length = read_varint(data) + value = data.read(length) + elif wire_type == 3: + end_bytes = encode_varint((field_number << 3) | 4) + value = read_group(data, end_bytes) + elif wire_type == 5: + value = data.read(4) + else: + raise Exception("Unknown wire type: " + str(wire_type) + ", Tag: " + bytes_to_hex(succinct_encode(tag)) + ", at position " + str(data.tell())) + yield (wire_type, field_number, 
value) + +def parse(data): + return {field_number: value for _, field_number, value in read_protobuf(data)} + +def b64_to_bytes(data): + if isinstance(data, bytes): + data = data.decode('ascii') + data = data.replace("%3D", "=") + return base64.urlsafe_b64decode(data + "="*((4 - len(data)%4)%4) ) diff --git a/youtube_data/search.py b/youtube_data/search.py new file mode 100644 index 0000000..3bf2452 --- /dev/null +++ b/youtube_data/search.py @@ -0,0 +1,171 @@ +from bs4 import BeautifulSoup as bs +from youtube_data import proto +from flask import Markup +import urllib.parse +import requests +import base64 +import json + +def page_number_to_sp_parameter(page, autocorrect, sort, filters): + offset = (int(page) - 1)*20 # 20 results per page + autocorrect = proto.nested(8, proto.uint(1, 1 - int(autocorrect) )) + filters_enc = proto.nested(2, proto.uint(1, filters['time']) + proto.uint(2, filters['type']) + proto.uint(3, filters['duration'])) + result = proto.uint(1, sort) + filters_enc + autocorrect + proto.uint(9, offset) + proto.string(61, b'') + return base64.urlsafe_b64encode(result).decode('ascii') + +def search_by_terms(search_terms, page, autocorrect, sort, filters): + url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(search_terms) + headers = { + 'Host': 'www.youtube.com', + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)', + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.5', + 'X-YouTube-Client-Name': '1', + 'X-YouTube-Client-Version': '2.20180418', + } + url += "&pbj=1&sp=" + page_number_to_sp_parameter(page, autocorrect, sort, filters).replace("=", "%3D") + content = requests.get(url, headers=headers).text + + info = json.loads(content) + videos = get_videos_from_search(info) + channels = get_channels_from_search(info) + + results = { + "videos": videos, + "channels": channels + } + return results + +def get_channels_from_search(search): + results = [] + search = search[1]['response'] + primaryContents = search['contents']['twoColumnSearchResultsRenderer']['primaryContents'] + items = primaryContents['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'] + + for item in items: + try: + item['channelRenderer'] + channel = get_channel_renderer_item_info(item['channelRenderer']) + results.append(channel) + except KeyError: + continue + return results + +def get_channel_renderer_item_info(item): + try: + suscribers = item['subscriberCountText']['simpleText'].split(" ")[0] + except: + suscribers = "?" 
+ + try: + description = get_description_snippet_text(item['descriptionSnippet']['runs']) + except KeyError: + description = "" + + try: + channel = { + "channelId": item['channelId'], + "username": item['title']['simpleText'], + "thumbnail": "https:{}".format(item['thumbnail']['thumbnails'][0]['url'].replace("/", "~")), + "description": Markup(str(description)), + "suscribers": suscribers, + "videos": item['videoCountText']['runs'][0]['text'] + } + except KeyError: + channel = { + "channelId": item['channelId'], + "username": item['title']['simpleText'], + "avatar": item['thumbnail']['thumbnails'][0]['url'], + "suscribers": suscribers + } + return channel + +def get_videos_from_search(search): + latest = [] + results = [] + search = search[1]['response'] + primaryContents = search['contents']['twoColumnSearchResultsRenderer']['primaryContents'] + items = primaryContents['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'] + + for item in items: + try: + item['videoRenderer'] + video = get_video_renderer_item_info(item['videoRenderer']) + results.append(video) + except KeyError: + continue + + # Sometimes Youtube will return an empty query. Try again. + return results + +def get_description_snippet_text(ds): + string = "" + for t in ds: + try: + if t['bold']: + text = ""+t['text']+"" + else: + text = t['text'] + except: + text = t['text'] + string = string + text + return string + +def get_video_renderer_item_info(item): + published = "" + views = "" + isLive = False + isUpcoming = False + + thumbnailOverlays = item['thumbnailOverlays'] + try: + if 'UPCOMING' in str(thumbnailOverlays): + start_time = item['upcomingEventData']['startTime'] + isUpcoming = True + views = "-" + published = "Scheduled" + except KeyError: + isUpcoming = False + + try: + if 'LIVE' in str(thumbnailOverlays): + isLive = True + try: + views = item['viewCountText']['simpleText'] + except: + views = "Live" + try: + duration = item['lengthText']['simpleText'] + except: + duration = "-" + if published != "Scheduled": + try: + published = item['publishedTimeText']['simpleText'] + except KeyError: + published = "None" + except: + isUpcoming = False + isLive = False + + if not isUpcoming and not isLive: + views = item['viewCountText']['simpleText'] + published = item['publishedTimeText']['simpleText'] + duration = item['lengthText']['simpleText'] + + video = { + 'videoTitle':item['title']['runs'][0]['text'], + 'description':Markup(str(get_description_snippet_text(item['descriptionSnippet']['runs']))), + 'views':views, + 'timeStamp':published, + 'duration':duration, + 'channelName':item['ownerText']['runs'][0]['text'], + 'authorUrl':"/channel/{}".format(item['ownerText']['runs'][0]['navigationEndpoint']['browseEndpoint']['browseId']), + 'channelId':item['ownerText']['runs'][0]['navigationEndpoint']['browseEndpoint']['browseId'], + 'id':item['videoId'], + 'videoUrl':"/watch?v={}".format(item['videoId']), + 'isLive':isLive, + 'isUpcoming':isUpcoming, + 'videoThumb':item['thumbnail']['thumbnails'][0]['url'] + } + return video + diff --git a/youtube_data/videos.py b/youtube_data/videos.py index 5592486..653f0d8 100644 --- a/youtube_data/videos.py +++ b/youtube_data/videos.py @@ -11,7 +11,6 @@ def get_renderer_key(renderer, key): return k[key] def get_video_primary_info(datad, datai): - contents = datai["contents"]["twoColumnWatchNextResults"]['results']['results']['contents'] item = get_renderer_key(contents, "videoPrimaryInfoRenderer") details = datad['videoDetails']
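Taken together, the new flow is: routes.py builds a filters dict and hands the raw query to yts.search_by_terms, which encodes page, sort, filter and autocorrect state into YouTube's protobuf-based `sp` parameter (via youtube_data/proto.py) and parses the JSON returned by the `pbj=1` endpoint. A minimal usage sketch, assuming the modules are importable exactly as added in this patch; the query string and printed fields are illustrative:

# Minimal sketch of the new search path; mirrors what ytsearch() in app/routes.py does.
# "privacy" is an illustrative query; dict keys come from youtube_data/search.py.
from youtube_data import search as yts
from youtube_data import proto

filters = {"time": 0, "type": 0, "duration": 0}   # 0 = unfiltered, as in routes.py

# The sp parameter packs sort, filters, autocorrect and the result offset as protobuf
# fields and base64url-encodes them; e.g. for page 2 the offset 20 is emitted as
# proto.uint(9, 20). (varint_encode(300) == b'\xac\x02': 7-bit groups, little-endian,
# high bit = continuation.)
sp = yts.page_number_to_sp_parameter(page=2, autocorrect=1, sort=0, filters=filters)

# Performs a live request to www.youtube.com/results with &pbj=1&sp=...
results = yts.search_by_terms("privacy", 1, 1, 0, filters)
for video in results["videos"]:
    print(video["videoTitle"], video["views"], video["timeStamp"])
for channel in results["channels"]:
    print(channel["username"], channel["suscribers"])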