Improved Youtube search

2020-09-11 10:34:24 +02:00 · 2020-09-11 10:34:24 +02:00 · a3f4d89781
commit a3f4d89781
parent 1e4c4c82e3
6 changed files with 345 additions and 49 deletions
--- a/app/routes.py
+++ b/app/routes.py
@ -7,7 +7,6 @@ from requests_futures.sessions import FuturesSession
 from werkzeug.datastructures import Headers
 from concurrent.futures import as_completed
 from werkzeug.utils import secure_filename
 from youtube_data import videos as ytvids
 from youtube_search import YoutubeSearch
 from werkzeug.urls import url_parse
 from youtube_dl import YoutubeDL
@ -24,6 +23,10 @@ import bleach
 import urllib
 import json
 import re
 #########################################
 from youtube_data import videos as ytvids
 from youtube_data import search as yts
 #########################################
 ##########################
 #### Config variables ####
 ##########################
@ -225,32 +228,14 @@ def ytsearch():
        channels = []
        videos = []
-        searchTerm = form.channelId.data
+        searchTerms = form.channelId.data
-        search = YoutubeSearch(searchTerm)
+        page = 1
-        chnns = search.channels_to_dict()
+        autocorrect = 1
-        vids = search.videos_to_dict()
+        sort = 0
-        
+        filters = {"time":0, "type":0, "duration":0}
-        for v in vids:
+        results = yts.search_by_terms(searchTerms, page, autocorrect, sort, filters)
            videos.append({
                'channelName':v['channel'],
                'videoTitle':v['title'],
                'description':Markup(v['long_desc']),
                'id':v['id'],
                'videoThumb': v['thumbnails'][-1],
                'channelUrl':v['url_suffix'],
                'channelId': v['channelId'],
                'views':v['views'],
                'timeStamp':v['publishedText']
            })
-        for c in chnns:
+        return render_template('ytsearch.html', form=form, btform=button_form, results=results, restricted=config['restrictPublicUsage'], config=config)
            channels.append({
                'username':c['name'],
                'channelId':c['id'],
                'thumbnail':'https:{}'.format(c['thumbnails'][0]),
                'subCount':c['suscriberCountText'].split(" ")[0]
            })
        return render_template('ytsearch.html', form=form, btform=button_form, channels=channels, videos=videos, restricted=config['restrictPublicUsage'], config=config)
    else:
        return render_template('ytsearch.html', form=form)
--- a/app/templates/_video_item.html
+++ b/app/templates/_video_item.html
@ -16,7 +16,7 @@
        </div>
    </div>
    <div class="extra content">
-        {% if video.views == "Livestream" %}
+        {% if video.isLive == "Livestream" or video.isLive %}
            <span class="right floated">
                <i class="red circle icon"></i>
                ‎‎{{video.views}}
@ -28,7 +28,7 @@
            </span>
        {% endif %}
-        {% if video.timeStamp == "Scheduled" %}
+        {% if video.timeStamp == "Scheduled" or video.isUpcoming %}
            <span class="right floated">
                <i class="blue clock icon"></i>
                {{video.timeStamp}}‎‎‎‎            ‎‎‎
--- a/app/templates/ytsearch.html
+++ b/app/templates/ytsearch.html
@ -14,37 +14,48 @@
        <p>{{ form.submit() }}</p>
    </form>
-    {% if channels %}
+    {% if results.channels %}
    <h3 class="ui dividing header">Users</h3>
            <div class="ui relaxed divided list">
-                    {% for res in channels %}
+                    {% for res in results.channels %}
                    <div class="item">
-                        {% if restricted or current_user.is_authenticated %}
+                        <div class="image">
-                        <div class="right floated content">
+                          <img src="{{ url_for('img', url=res.thumbnail) }}">
-                            {% if not current_user.is_following_yt(res.channelId) %}
+                        </div>
-                                <p>
+                        <div class="content">
                          <a class = "header" href="{{ url_for('channel', id=res.channelId)}}">{{res.username}}</a>
                          <div class="meta">
                            <span>{{res.description}}</span>
                          </div>
                          <div class="description">
                            <p></p>
                          </div>
                          <div class="extra">
                            <div class="ui label">
                                <i class="user icon"></i> {{res.suscribers}}
                            </div>
                            <div class="ui label">
                                <i class="video icon"></i> {{res.videos}}
                            </div>
                            {% if restricted or current_user.is_authenticated %}
                                <div class="right floated content">
                                {% if not current_user.is_following_yt(res.channelId) %}
                                    <form action="{{ url_for('ytfollow', channelId=res.channelId) }}" method="post">
                                        {{ btform.hidden_tag() }}
                                        {{ btform.submit(value='Follow') }}
                                    </form>
-                                </p>
+                                {% else %}
                            {% else %}
                                <p>
                                    <form action="{{ url_for('ytunfollow', channelId=res.channelId) }}" method="post">
                                        {{ btform.hidden_tag() }}
                                        {{ btform.submit(value='Unfollow') }}
                                    </form>
-                                </p>
+                                {% endif %}
                                </div>  
                            {% endif %}
-                        </div>
+                          </div>
                        {% endif %}
                        <img alt="Avatar" class="ui avatar image" src="{{ res.thumbnail }}">
                        <div class="content">
                            <a class = "header" href="{{ url_for('channel', id=res.channelId)}}">{{res.username}}</a>
                            <div class="description"><div class="ui label">
                                <i class="user icon"></i> {{res.subCount}}
                            </div></div>
                        </div>
                    </div>
                    {% endfor %}
@ -53,10 +64,10 @@
            <div class="ui middle aligned divided list">
-                {% if videos %}
+                {% if results.videos %}
                <h3 class="ui dividing header">Videos</h3>
                    <div class="ui centered cards">
-                        {% for video in videos %}
+                        {% for video in results.videos %}
                            {% include '_video_item.html' %}
                        {% endfor %}
                    </div>
--- a/youtube_data/proto.py
+++ b/youtube_data/proto.py
@ -0,0 +1,130 @@
 from math import ceil
 import base64
 import io
 # FROM https://github.com/user234683/youtube-local/blob/master/youtube/proto.py
 def byte(n):
    '''Return a bytes object of length one whose only element is the integer n.'''
    return bytes([n])
 def varint_encode(offset):
    '''Encode the integer offset as a protobuf-style varint and return it as bytes.

    Every output byte carries 7 bits of data in its low bits. The high bit is 1
    when more bytes follow and 0 on the last byte. The 7-bit groups are written
    least-significant group first, so data made of the groups
    aaaaaaabbbbbbbccccccc is encoded as:
    1ccccccc 1bbbbbbb 0aaaaaaa
    This encoding is used in youtube parameters to encode offsets and to encode
    the length for length-prefixed data.
    See https://developers.google.com/protocol-buffers/docs/encoding#varints for more info.'''
    # (0).bit_length() is 0, but even zero needs one output byte.
    total_bytes = max(1, (offset.bit_length() + 6) // 7)
    output = bytearray()
    for _ in range(total_bytes - 1):
        # 7 least significant bits, with the continuation bit set
        output.append((offset & 127) | 128)
        offset >>= 7
    # the final byte keeps its high bit clear
    output.append(offset & 127)
    return bytes(output)
 def varint_decode(encoded):
    '''Decode a varint from the start of encoded (an iterable of byte values).

    Decoding stops at the first byte whose high bit is clear; anything after it
    is ignored. Returns the decoded integer (0 for empty input).'''
    value = 0
    shift = 0
    for current in encoded:
        value |= (current & 127) << shift
        if (current & 128) == 0:
            # high bit clear: this was the last byte of the varint
            break
        shift += 7
    return value
 def string(field_number, data):
    '''Build a length-delimited (wire type 2) field.

    data may be str or bytes; it is passed through as_bytes() first. The field
    payload is the varint-encoded length followed by the bytes themselves.'''
    payload = as_bytes(data)
    length_prefix = varint_encode(len(payload))
    return _proto_field(2, field_number, length_prefix + payload)
 # An embedded message is written exactly like a string field, so `nested` is an alias.
 nested = string
 def uint(field_number, value):
    '''Build a varint (wire type 0) field holding the integer value.'''
    encoded_value = varint_encode(value)
    return _proto_field(0, field_number, encoded_value)
 def _proto_field(wire_type, field_number, data):
    '''Prefix data with the varint-encoded tag (field number and wire type).

    See https://developers.google.com/protocol-buffers/docs/encoding#structure '''
    # The tag keeps the wire type in the low 3 bits and the field number above them.
    tag = (field_number << 3) | wire_type
    return varint_encode(tag) + data
 def percent_b64encode(data):
    '''URL-safe base64-encode data, writing each '=' as the percent escape %3D.'''
    encoded = base64.urlsafe_b64encode(data)
    return encoded.replace(b'=', b'%3D')
 def unpadded_b64encode(data):
    '''URL-safe base64-encode data and remove every '=' character from the result.'''
    encoded = base64.urlsafe_b64encode(data)
    return encoded.replace(b'=', b'')
 def as_bytes(value):
    '''Return value UTF-8 encoded when it is a str; return it unchanged otherwise.'''
    return value.encode('utf-8') if isinstance(value, str) else value
 def read_varint(data):
    '''Read one varint from the binary stream data and return it as an int.

    Raises EOFError when the stream is already exhausted before the first byte,
    and a plain Exception when it ends in the middle of a varint.'''
    value = 0
    position = 0  # number of bytes of this varint consumed so far
    while True:
        try:
            current = data.read(1)[0]
        except IndexError:
            # read(1) returned an empty result: the stream is exhausted
            if position:
                raise Exception('Unterminated varint starting at ' + str(data.tell() - position))
            raise EOFError()
        value |= (current & 127) << (7 * position)
        if current & 128:
            # continuation bit set: another byte follows
            position += 1
            continue
        return value
 def read_group(data, end_sequence):
    '''Return the bytes between the current stream position and end_sequence.

    data must expose the complete underlying buffer as data.original. On return
    the stream is positioned just past end_sequence. Raises a plain Exception
    when end_sequence does not occur at or after the current position.'''
    group_start = data.tell()
    group_end = data.original.find(end_sequence, group_start)
    if group_end == -1:
        raise Exception('Unterminated group')
    # skip over the end marker so the next read starts after the group
    data.seek(group_end + len(end_sequence))
    return data.original[group_start:group_end]
 def read_protobuf(data):
    '''Iterate over the top-level fields of the serialized protobuf message data.

    data is the raw message as bytes. Yields one (wire_type, field_number, value)
    tuple per field, where value depends on the wire type:
      0 - the decoded varint as an int
      1 - the 8 raw bytes of the field
      2 - the length-delimited payload as bytes
      3 - the raw bytes of the group, up to (not including) its end-group tag
      5 - the 4 raw bytes of the field
    Iteration stops when the input is exhausted at a field boundary.

    Raises a plain Exception for any other wire type, for a varint cut off by
    the end of the input, or for a group without an end tag.'''
    data_original = data
    data = io.BytesIO(data)
    # read_group() searches the raw buffer through this attribute.
    data.original = data_original
    while True:
        try:
            tag = read_varint(data)
        except EOFError:
            break
        wire_type = tag & 7
        field_number = tag >> 3
        if wire_type == 0:
            value = read_varint(data)
        elif wire_type == 1:
            value = data.read(8)
        elif wire_type == 2:
            length = read_varint(data)
            value = data.read(length)
        elif wire_type == 3:
            # A group ends at the tag with the same field number and wire type 4.
            # (The encoder in this module is named varint_encode.)
            end_bytes = varint_encode((field_number << 3) | 4)
            value = read_group(data, end_bytes)
        elif wire_type == 5:
            value = data.read(4)
        else:
            # Report the tag as the hex of its varint encoding, using only
            # helpers that exist in this module.
            raise Exception("Unknown wire type: " + str(wire_type) + ", Tag: " + varint_encode(tag).hex() + ", at position " + str(data.tell()))
        yield (wire_type, field_number, value)
 def parse(data):
    '''Parse the serialized protobuf message data into a {field_number: value} dict.

    Wire types are discarded. When a field number occurs more than once, the
    last occurrence wins.'''
    fields = {}
    for _wire_type, field_number, value in read_protobuf(data):
        fields[field_number] = value
    return fields
 def b64_to_bytes(data):
    '''Decode URL-safe base64 given as str or ASCII bytes.

    Percent-escaped padding (%3D) is accepted, and any missing '=' padding is
    added before decoding.'''
    text = data.decode('ascii') if isinstance(data, bytes) else data
    text = text.replace("%3D", "=")
    # pad up to the next multiple of four characters
    padding = "=" * (-len(text) % 4)
    return base64.urlsafe_b64decode(text + padding)
--- a/youtube_data/search.py
+++ b/youtube_data/search.py
@ -0,0 +1,171 @@
 from bs4 import BeautifulSoup as bs
 from youtube_data import proto
 from flask import Markup
 import urllib.parse
 import requests
 import base64
 import json
 def page_number_to_sp_parameter(page, autocorrect, sort, filters):
    '''Build the base64 string that search_by_terms() sends as the `sp` query parameter.

    page        -- 1-based page number (anything int() accepts)
    autocorrect -- truthy integer to keep autocorrection; encoded as 1 - int(autocorrect)
    sort        -- integer written to field 1
    filters     -- dict with integer values under 'time', 'type' and 'duration'

    Returns the URL-safe base64 encoding of the protobuf message, as str.'''
    results_per_page = 20
    offset = (int(page) - 1) * results_per_page
    autocorrect_field = proto.nested(8, proto.uint(1, 1 - int(autocorrect)))
    filter_payload = (
        proto.uint(1, filters['time'])
        + proto.uint(2, filters['type'])
        + proto.uint(3, filters['duration'])
    )
    filters_field = proto.nested(2, filter_payload)
    message = (
        proto.uint(1, sort)
        + filters_field
        + autocorrect_field
        + proto.uint(9, offset)
        + proto.string(61, b'')
    )
    return base64.urlsafe_b64encode(message).decode('ascii')
 def search_by_terms(search_terms, page, autocorrect, sort, filters, timeout=10):
    '''Query the YouTube results page and return the videos and channels found.

    search_terms -- the raw search string; it is URL-quoted here
    page, autocorrect, sort, filters -- passed to page_number_to_sp_parameter()
    timeout      -- seconds handed to requests.get(); a stalled connection
                    raises a requests exception instead of blocking forever

    Returns {"videos": [...], "channels": [...]} as produced by
    get_videos_from_search() and get_channels_from_search().

    Raises whatever requests.get() raises on network failure or timeout, and
    json.JSONDecodeError when the response body is not JSON.'''
    headers = {
        'Host': 'www.youtube.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'X-YouTube-Client-Name': '1',
        'X-YouTube-Client-Version': '2.20180418',
    }
    # '=' padding of the base64 parameter has to be percent-escaped in the URL.
    sp = page_number_to_sp_parameter(page, autocorrect, sort, filters).replace("=", "%3D")
    url = (
        "https://www.youtube.com/results?search_query="
        + urllib.parse.quote_plus(search_terms)
        + "&pbj=1&sp=" + sp
    )
    # requests applies no timeout by default; without one a hung connection
    # would block the calling worker indefinitely.
    content = requests.get(url, headers=headers, timeout=timeout).text
    info = json.loads(content)
    return {
        "videos": get_videos_from_search(info),
        "channels": get_channels_from_search(info),
    }
 def get_channels_from_search(search):
    results = []
    search = search[1]['response']
    primaryContents = search['contents']['twoColumnSearchResultsRenderer']['primaryContents']
    items = primaryContents['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']
    for item in items:
        try:
            item['channelRenderer']
            channel = get_channel_renderer_item_info(item['channelRenderer'])
            results.append(channel)
        except KeyError:
            continue
    return results
 def get_channel_renderer_item_info(item):
    '''Convert one 'channelRenderer' search item into a flat dict.

    The returned dict always has the same keys, whichever optional fields are
    present in item:
      channelId   -- item['channelId']
      username    -- item['title']['simpleText']
      thumbnail   -- first thumbnail URL, prefixed with "https:" and with every
                     "/" replaced by "~"
      avatar      -- the same thumbnail URL, unmodified (kept because the old
                     fallback result exposed it under this name)
      description -- Markup built from the description snippet, "" when absent
      suscribers  -- first word of the subscriber count text, "?" when absent
      videos      -- video count text, "?" when absent

    Raises KeyError when channelId, the title or the thumbnail is missing.'''
    try:
        suscribers = item['subscriberCountText']['simpleText'].split(" ")[0]
    except (KeyError, TypeError, AttributeError):
        suscribers = "?"
    try:
        description = get_description_snippet_text(item['descriptionSnippet']['runs'])
    except KeyError:
        description = ""
    try:
        videos = item['videoCountText']['runs'][0]['text']
    except (KeyError, IndexError):
        # A missing video count used to switch to a differently shaped dict
        # without 'thumbnail', which the search template reads.
        videos = "?"
    thumbnail_url = item['thumbnail']['thumbnails'][0]['url']
    return {
        "channelId": item['channelId'],
        "username": item['title']['simpleText'],
        "thumbnail": "https:{}".format(thumbnail_url.replace("/", "~")),
        "avatar": thumbnail_url,
        "description": Markup(str(description)),
        "suscribers": suscribers,
        "videos": videos,
    }
 def get_videos_from_search(search):
    latest = []
    results = []
    search = search[1]['response']
    primaryContents = search['contents']['twoColumnSearchResultsRenderer']['primaryContents']
    items = primaryContents['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']
    for item in items:
        try:
            item['videoRenderer']
            video = get_video_renderer_item_info(item['videoRenderer'])
            results.append(video)
        except KeyError:
            continue
    # Sometimes Youtube will return an empty query. Try again.        
    return results
 def get_description_snippet_text(ds):
    '''Join the text runs of a description snippet into one HTML string.

    ds is an iterable of dicts, each with a 'text' entry and an optional 'bold'
    flag. Runs with a truthy 'bold' are wrapped in <b>...</b>.

    The run text comes from a remote response and callers in this module wrap
    the result in Markup, so every run is HTML-escaped here. Only the <b> tags
    added by this function reach the page as markup.

    Raises KeyError when a run has no 'text' entry.'''
    import html  # function-scope import: the module header does not import it

    parts = []
    for run in ds:
        text = html.escape(run['text'])
        if run.get('bold'):
            text = "<b>" + text + "</b>"
        parts.append(text)
    return "".join(parts)
 def get_video_renderer_item_info(item):
    '''Convert one 'videoRenderer' search item into a flat dict.

    Three kinds of item are told apart by searching the text of
    item['thumbnailOverlays']:
      upcoming -- contains 'UPCOMING' and item['upcomingEventData']['startTime']
                  exists: views is "-" and timeStamp is "Scheduled"
      live     -- contains 'LIVE': views falls back to "Live", duration to "-"
                  and timeStamp to "None" when the respective field is missing
      regular  -- everything else: views, timeStamp and duration are required

    duration defaults to "-" so that an upcoming item, which has no length,
    still produces a complete dict. A missing description snippet yields an
    empty description instead of rejecting the whole item.

    Returns a dict with the keys videoTitle, description, views, timeStamp,
    duration, channelName, authorUrl, channelId, id, videoUrl, isLive,
    isUpcoming and videoThumb.

    Raises KeyError when a required field is missing; get_videos_from_search()
    skips such items.'''
    published = ""
    views = ""
    duration = "-"
    isLive = False
    isUpcoming = False
    overlays_text = str(item['thumbnailOverlays'])

    if 'UPCOMING' in overlays_text:
        try:
            # Only the presence of a start time matters; its value is unused.
            item['upcomingEventData']['startTime']
        except KeyError:
            isUpcoming = False
        else:
            isUpcoming = True
            views = "-"
            published = "Scheduled"

    if 'LIVE' in overlays_text:
        isLive = True
        try:
            views = item['viewCountText']['simpleText']
        except (KeyError, TypeError):
            views = "Live"
        try:
            duration = item['lengthText']['simpleText']
        except (KeyError, TypeError):
            duration = "-"
        if published != "Scheduled":
            try:
                published = item['publishedTimeText']['simpleText']
            except KeyError:
                published = "None"

    if not isUpcoming and not isLive:
        views = item['viewCountText']['simpleText']
        published = item['publishedTimeText']['simpleText']
        duration = item['lengthText']['simpleText']

    try:
        description = get_description_snippet_text(item['descriptionSnippet']['runs'])
    except KeyError:
        # Videos without a description are still valid results.
        description = ""

    owner = item['ownerText']['runs'][0]
    channel_id = owner['navigationEndpoint']['browseEndpoint']['browseId']
    video = {
        'videoTitle':item['title']['runs'][0]['text'],
        'description':Markup(str(description)),
        'views':views,
        'timeStamp':published,
        'duration':duration,
        'channelName':owner['text'],
        'authorUrl':"/channel/{}".format(channel_id),
        'channelId':channel_id,
        'id':item['videoId'],
        'videoUrl':"/watch?v={}".format(item['videoId']),
        'isLive':isLive,
        'isUpcoming':isUpcoming,
        'videoThumb':item['thumbnail']['thumbnails'][0]['url']
    }
    return video
--- a/youtube_data/videos.py
+++ b/youtube_data/videos.py
@ -11,7 +11,6 @@ def get_renderer_key(renderer, key):
            return k[key]
 def get_video_primary_info(datad, datai):
    contents = datai["contents"]["twoColumnWatchNextResults"]['results']['results']['contents']
    item = get_renderer_key(contents, "videoPrimaryInfoRenderer")
    details = datad['videoDetails']