Improved Youtube search

2020-09-11 10:34:24 +02:00 · 2020-09-11 10:34:24 +02:00 · a3f4d89781
commit a3f4d89781
parent 1e4c4c82e3
6 changed files with 345 additions and 49 deletions
--- a/app/routes.py
+++ b/app/routes.py
@ -7,7 +7,6 @@ from requests_futures.sessions import FuturesSession
 from werkzeug.datastructures import Headers
 from concurrent.futures import as_completed
 from werkzeug.utils import secure_filename
-from youtube_data import videos as ytvids
 from youtube_search import YoutubeSearch
 from werkzeug.urls import url_parse
 from youtube_dl import YoutubeDL
@ -24,6 +23,10 @@ import bleach
 import urllib
 import json
 import re
+#########################################
+from youtube_data import videos as ytvids
+from youtube_data import search as yts
+#########################################
 ##########################
 #### Config variables ####
 ##########################
@ -225,32 +228,14 @@ def ytsearch():
        channels = []
        videos = []

-        searchTerm = form.channelId.data
-        search = YoutubeSearch(searchTerm)
-        chnns = search.channels_to_dict()
-        vids = search.videos_to_dict()
-        
-        for v in vids:
-            videos.append({
-                'channelName':v['channel'],
-                'videoTitle':v['title'],
-                'description':Markup(v['long_desc']),
-                'id':v['id'],
-                'videoThumb': v['thumbnails'][-1],
-                'channelUrl':v['url_suffix'],
-                'channelId': v['channelId'],
-                'views':v['views'],
-                'timeStamp':v['publishedText']
-            })
+        searchTerms = form.channelId.data
+        page = 1
+        autocorrect = 1
+        sort = 0
+        filters = {"time":0, "type":0, "duration":0}
+        results = yts.search_by_terms(searchTerms, page, autocorrect, sort, filters)

-        for c in chnns:
-            channels.append({
-                'username':c['name'],
-                'channelId':c['id'],
-                'thumbnail':'https:{}'.format(c['thumbnails'][0]),
-                'subCount':c['suscriberCountText'].split(" ")[0]
-            })
-        return render_template('ytsearch.html', form=form, btform=button_form, channels=channels, videos=videos, restricted=config['restrictPublicUsage'], config=config)
+        return render_template('ytsearch.html', form=form, btform=button_form, results=results, restricted=config['restrictPublicUsage'], config=config)

    else:
        return render_template('ytsearch.html', form=form)
--- a/app/templates/_video_item.html
+++ b/app/templates/_video_item.html
@ -16,7 +16,7 @@
        </div>
    </div>
    <div class="extra content">
-        {% if video.views == "Livestream" %}
+        {% if video.isLive == "Livestream" or video.isLive %}
            <span class="right floated">
                <i class="red circle icon"></i>
                ‎‎{{video.views}}
@ -28,7 +28,7 @@
            </span>
        {% endif %}

-        {% if video.timeStamp == "Scheduled" %}
+        {% if video.timeStamp == "Scheduled" or video.isUpcoming %}
            <span class="right floated">
                <i class="blue clock icon"></i>
                {{video.timeStamp}}‎‎‎‎            ‎‎‎
--- a/app/templates/ytsearch.html
+++ b/app/templates/ytsearch.html
@ -14,37 +14,48 @@
        <p>{{ form.submit() }}</p>
    </form>

-    {% if channels %}
+    {% if results.channels %}
    <h3 class="ui dividing header">Users</h3>
            <div class="ui relaxed divided list">

-                    {% for res in channels %}
+                    {% for res in results.channels %}
                    <div class="item">
-                        {% if restricted or current_user.is_authenticated %}
-                        <div class="right floated content">
-                            {% if not current_user.is_following_yt(res.channelId) %}
-                                <p>
+                        <div class="image">
+                          <img src="{{ url_for('img', url=res.thumbnail) }}">
+                        </div>
+                        <div class="content">
+                          <a class = "header" href="{{ url_for('channel', id=res.channelId)}}">{{res.username}}</a>
+                          <div class="meta">
+                            <span>{{res.description}}</span>
+                          </div>
+                          <div class="description">
+                            <p></p>
+                          </div>
+                          <div class="extra">
+                            <div class="ui label">
+                                <i class="user icon"></i> {{res.suscribers}}
+                            </div>
+
+                            <div class="ui label">
+                                <i class="video icon"></i> {{res.videos}}
+                            </div>
+                            
+                            {% if restricted or current_user.is_authenticated %}
+                                <div class="right floated content">
+                                {% if not current_user.is_following_yt(res.channelId) %}
                                    <form action="{{ url_for('ytfollow', channelId=res.channelId) }}" method="post">
                                        {{ btform.hidden_tag() }}
                                        {{ btform.submit(value='Follow') }}
                                    </form>
-                                </p>
-                            {% else %}
-                                <p>
+                                {% else %}
                                    <form action="{{ url_for('ytunfollow', channelId=res.channelId) }}" method="post">
                                        {{ btform.hidden_tag() }}
                                        {{ btform.submit(value='Unfollow') }}
                                    </form>
-                                </p>
+                                {% endif %}
+                                </div>  
                            {% endif %}
-                        </div>
-                        {% endif %}
-                        <img alt="Avatar" class="ui avatar image" src="{{ res.thumbnail }}">
-                        <div class="content">
-                            <a class = "header" href="{{ url_for('channel', id=res.channelId)}}">{{res.username}}</a>
-                            <div class="description"><div class="ui label">
-                                <i class="user icon"></i> {{res.subCount}}
-                            </div></div>
+                          </div>
                        </div>
                    </div>
                    {% endfor %}
@ -53,10 +64,10 @@


            <div class="ui middle aligned divided list">
-                {% if videos %}
+                {% if results.videos %}
                <h3 class="ui dividing header">Videos</h3>
                    <div class="ui centered cards">
-                        {% for video in videos %}
+                        {% for video in results.videos %}
                            {% include '_video_item.html' %}
                        {% endfor %}
                    </div>
--- a/youtube_data/proto.py
+++ b/youtube_data/proto.py
@ -0,0 +1,130 @@
+from math import ceil
+import base64
+import io
+
+# FROM https://github.com/user234683/youtube-local/blob/master/youtube/proto.py
+
+def byte(n):
+    '''Return a bytes object of length one whose single element is the integer n.'''
+    return bytes([n])
+
+    
+def varint_encode(offset):
+    '''Encode the integer offset as a base-128 varint and return the bytes.
+
+    The value is cut into 7-bit groups that are written least significant group first.
+    Every byte except the last has its top bit set to 1, meaning more bytes follow; the
+    last byte has its top bit set to 0. For example, if the data is
+    aaaaaaabbbbbbbccccccc (each of these sections is 7 bits), it is encoded as:
+    1ccccccc 1bbbbbbb 0aaaaaaa
+
+    This encoding is used in youtube parameters to encode offsets and to encode the length for length-prefixed data.
+    See https://developers.google.com/protocol-buffers/docs/encoding#varints for more info.
+
+    NOTE(review): presumably offset is never negative; nothing here checks that.'''
+    # (0).bit_length() is 0, but zero still has to occupy one byte.
+    total = ceil(offset.bit_length()/7) or 1
+    groups = []
+    remaining = offset
+    for _ in range(total - 1):
+        groups.append((remaining & 127) | 128)  # low 7 bits plus the "more bytes" flag
+        remaining >>= 7
+    groups.append(remaining & 127)  # final byte keeps its top bit clear
+    return bytes(groups)
+
+    
+def varint_decode(encoded):
+    '''Decode the varint at the start of encoded (an iterable of byte values) into an int.
+
+    Decoding stops at the first byte whose top bit is clear; anything after it is ignored.
+    An empty input decodes to 0.'''
+    value = 0
+    shift = 0
+    for octet in encoded:
+        value |= (octet & 127) << shift
+        if not (octet & 128):
+            # Top bit clear: this was the last byte of the varint.
+            break
+        shift += 7
+    return value
+
+    
+def string(field_number, data):
+    '''Encode data as a length-delimited field (wire type 2) with the given field number.
+
+    data may be str or bytes; it is passed through as_bytes first. The payload is
+    preceded by its length as a varint.'''
+    payload = as_bytes(data)
+    length_prefix = varint_encode(len(payload))
+    return _proto_field(2, field_number, length_prefix + payload)
+# An embedded message is encoded exactly like a string, so the same function serves both.
+nested = string
+
+def uint(field_number, value):
+    '''Encode the integer value as a varint field (wire type 0) with the given field number.'''
+    encoded_value = varint_encode(value)
+    return _proto_field(0, field_number, encoded_value)
+    
+
+    
+    
+def _proto_field(wire_type, field_number, data):
+    '''Prefix the already-encoded data with its field tag.
+
+    The tag is the varint of (field_number << 3) | wire_type.
+    See https://developers.google.com/protocol-buffers/docs/encoding#structure '''
+    tag = (field_number << 3) | wire_type
+    return varint_encode(tag) + data
+
+
+    
+def percent_b64encode(data):
+    '''Return the URL-safe base64 encoding of data with each '=' replaced by '%3D'.'''
+    encoded = base64.urlsafe_b64encode(data)
+    return encoded.replace(b'=', b'%3D')
+    
+    
+def unpadded_b64encode(data):
+    '''Return the URL-safe base64 encoding of data with the '=' padding removed.'''
+    encoded = base64.urlsafe_b64encode(data)
+    # '=' only ever appears as trailing padding in base64 output.
+    return encoded.rstrip(b'=')
+
+def as_bytes(value):
+    '''Return value encoded as UTF-8 when it is a str; return anything else unchanged.'''
+    return value.encode('utf-8') if isinstance(value, str) else value
+
+
+def read_varint(data):
+    '''Read one varint from the binary stream data and return it as an int.
+
+    Raises EOFError when the stream is already exhausted before the first byte, and a
+    plain Exception when the stream ends in the middle of a varint.'''
+    value = 0
+    consumed = 0  # bytes of this varint read so far
+    while True:
+        try:
+            current = data.read(1)[0]
+        except IndexError:
+            # read() returned nothing: the stream is exhausted.
+            if consumed:
+                raise Exception('Unterminated varint starting at ' + str(data.tell() - consumed))
+            raise EOFError()
+        value |= (current & 127) << (7 * consumed)
+        if not current & 128:
+            # Top bit clear marks the final byte.
+            return value
+        consumed += 1
+
+                                
+def read_group(data, end_sequence):
+    '''Return the bytes between the current stream position and the next end_sequence.
+
+    data must be a seekable stream carrying the whole buffer in its "original" attribute
+    (read_protobuf sets this up). The stream is left positioned just past end_sequence.
+    Raises Exception when end_sequence does not occur after the current position.'''
+    begin = data.tell()
+    buffer = data.original
+    end = buffer.find(end_sequence, begin)
+    if end == -1:
+        raise Exception('Unterminated group')
+    data.seek(end + len(end_sequence))
+    return buffer[begin:end]
+
+def read_protobuf(data):
+    '''Iterate over the top-level fields of the serialized protobuf message data (bytes).
+
+    Yields one (wire_type, field_number, value) tuple per field. The value depends on the wire type:
+      0: the decoded varint, as an int
+      1: the next 8 bytes, undecoded
+      2: the length-delimited payload, as bytes
+      3: the raw bytes of the group, up to but not including its end tag
+      5: the next 4 bytes, undecoded
+    Any other wire type raises Exception.
+    See https://developers.google.com/protocol-buffers/docs/encoding#structure '''
+    data_original = data
+    data = io.BytesIO(data)
+    data.original = data_original  # read_group searches the raw buffer for the end tag
+    while True:
+        try:
+            tag = read_varint(data)
+        except EOFError:
+            break  # input exhausted at a field boundary: no more fields
+        wire_type = tag & 7
+        field_number = tag >> 3
+
+        if wire_type == 0:
+            value = read_varint(data)
+        elif wire_type == 1:
+            value = data.read(8)
+        elif wire_type == 2:
+            length = read_varint(data)
+            value = data.read(length)
+        elif wire_type == 3:
+            # A group is closed by a tag with the same field number and wire type 4.
+            end_bytes = varint_encode((field_number << 3) | 4)
+            value = read_group(data, end_bytes)
+        elif wire_type == 5:
+            value = data.read(4)
+        else:
+            raise Exception("Unknown wire type: " + str(wire_type) + ", Tag: " + varint_encode(tag).hex() + ", at position " + str(data.tell()))
+        yield (wire_type, field_number, value)
+
+def parse(data):
+    '''Decode the protobuf message data into a dict mapping field number to value.
+
+    Wire types are discarded. When a field number occurs more than once, the last
+    occurrence wins.'''
+    fields = {}
+    for _wire_type, number, value in read_protobuf(data):
+        fields[number] = value
+    return fields
+
+def b64_to_bytes(data):
+    '''Decode URL-safe base64 given as str or ASCII bytes.
+
+    Padding may be written as '=', percent-encoded as '%3D', or left out entirely.'''
+    text = data.decode('ascii') if isinstance(data, bytes) else data
+    text = text.replace("%3D", "=")
+    # Top the input up to a multiple of four characters, as the decoder requires.
+    missing = -len(text) % 4
+    return base64.urlsafe_b64decode(text + "=" * missing)
--- a/youtube_data/search.py
+++ b/youtube_data/search.py
@ -0,0 +1,171 @@
+from bs4 import BeautifulSoup as bs
+from youtube_data import proto
+from flask import Markup
+import urllib.parse
+import requests
+import base64
+import json
+
+def page_number_to_sp_parameter(page, autocorrect, sort, filters):
+    '''Build the value of the "sp" query parameter for a search request.
+
+    page is the 1-based page number, autocorrect a 0/1 flag, sort an integer and filters
+    a dict with integer "time", "type" and "duration" entries. They are serialized as a
+    protobuf message and returned as URL-safe base64 text.'''
+    results_per_page = 20
+    offset = results_per_page * (int(page) - 1)
+    # The flag is stored inverted: field 1 of nested message 8 holds 1 - autocorrect.
+    autocorrect_msg = proto.nested(8, proto.uint(1, 1 - int(autocorrect)))
+    filter_fields = (
+        proto.uint(1, filters['time'])
+        + proto.uint(2, filters['type'])
+        + proto.uint(3, filters['duration'])
+    )
+    filters_msg = proto.nested(2, filter_fields)
+    message = b''.join([
+        proto.uint(1, sort),
+        filters_msg,
+        autocorrect_msg,
+        proto.uint(9, offset),
+        proto.string(61, b''),
+    ])
+    return base64.urlsafe_b64encode(message).decode('ascii')
+
+def search_by_terms(search_terms, page, autocorrect, sort, filters):
+    '''Query the YouTube results page for search_terms and return the parsed results.
+
+    page, autocorrect, sort and filters are forwarded to page_number_to_sp_parameter to
+    build the "sp" query parameter. Returns a dict with two lists, "videos" and
+    "channels", as produced by get_videos_from_search and get_channels_from_search.
+
+    Raises requests.exceptions.Timeout when YouTube does not answer in time, and whatever
+    json.loads raises when the response body is not JSON.'''
+    url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(search_terms)
+    headers = {
+        'Host': 'www.youtube.com',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
+        'Accept': '*/*',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'X-YouTube-Client-Name': '1',
+        'X-YouTube-Client-Version': '2.20180418',
+    }
+    # pbj=1 asks for a JSON response; base64 padding has to be percent-encoded in the URL.
+    url += "&pbj=1&sp=" + page_number_to_sp_parameter(page, autocorrect, sort, filters).replace("=", "%3D")
+    # requests waits forever by default; bound the wait so a stalled connection
+    # cannot block the caller indefinitely.
+    content = requests.get(url, headers=headers, timeout=10).text
+
+    info = json.loads(content)
+    return {
+        "videos": get_videos_from_search(info),
+        "channels": get_channels_from_search(info)
+    }
+
+def get_channels_from_search(search):
+    '''Collect the channel entries from a decoded search response.
+
+    search is the decoded JSON; only the items of the first section of the primary
+    contents of search[1]['response'] are examined. Returns a list of dicts built by
+    get_channel_renderer_item_info. Items that are not channels, or whose conversion
+    raises KeyError, are left out.'''
+    response = search[1]['response']
+    section_list = response['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']
+    entries = section_list['contents'][0]['itemSectionRenderer']['contents']
+
+    found = []
+    for entry in entries:
+        try:
+            found.append(get_channel_renderer_item_info(entry['channelRenderer']))
+        except KeyError:
+            # Not a channel entry, or the renderer lacked an expected key.
+            pass
+    return found
+
+def get_channel_renderer_item_info(item):
+    '''Build the dict describing one channel from a "channelRenderer" search item.
+
+    The result always has the keys channelId, username, thumbnail, description,
+    suscribers and videos, so consumers see one shape whether or not YouTube sent a
+    video count. A missing subscriber or video count becomes "?" and a missing
+    description becomes "".
+
+    Raises KeyError when the channel id, title or thumbnail is missing.'''
+    try:
+        suscribers = item['subscriberCountText']['simpleText'].split(" ")[0]
+    except (KeyError, TypeError, AttributeError):
+        suscribers = "?"
+
+    try:
+        description = get_description_snippet_text(item['descriptionSnippet']['runs'])
+    except KeyError:
+        description = ""
+
+    try:
+        videos = item['videoCountText']['runs'][0]['text']
+    except (KeyError, IndexError):
+        videos = "?"
+
+    return {
+        "channelId": item['channelId'],
+        "username": item['title']['simpleText'],
+        # Slashes are swapped for "~" in the thumbnail URL, exactly as before.
+        "thumbnail": "https:{}".format(item['thumbnail']['thumbnails'][0]['url'].replace("/", "~")),
+        "description": Markup(str(description)),
+        "suscribers": suscribers,
+        "videos": videos
+    }
+
+def get_videos_from_search(search):
+    '''Collect the video entries from a decoded search response.
+
+    search is the decoded JSON; only the items of the first section of the primary
+    contents of search[1]['response'] are examined. Returns a list of dicts built by
+    get_video_renderer_item_info. Items that are not videos, or whose conversion raises
+    KeyError, are left out. The list may be empty; no retry is attempted here.'''
+    response = search[1]['response']
+    section_list = response['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']
+    entries = section_list['contents'][0]['itemSectionRenderer']['contents']
+
+    found = []
+    for entry in entries:
+        try:
+            found.append(get_video_renderer_item_info(entry['videoRenderer']))
+        except KeyError:
+            # Not a video entry, or the renderer lacked an expected key.
+            pass
+    return found
+
+def get_description_snippet_text(ds):
+    '''Render a list of description "runs" as an HTML fragment.
+
+    ds is a list of dicts, each with a "text" entry and optionally a truthy "bold"
+    entry. Bold runs are wrapped in <b>...</b>. The text of every run is HTML-escaped,
+    because it comes from the remote response and callers wrap the result in Markup,
+    which disables any later escaping.
+
+    Raises KeyError when a run has no "text" entry.'''
+    import html
+
+    parts = []
+    for run in ds:
+        text = html.escape(run['text'])
+        if run.get('bold'):
+            text = "<b>" + text + "</b>"
+        parts.append(text)
+    return "".join(parts)
+
+def _renderer_simple_text(item, key, default):
+    '''Return item[key]['simpleText'], or default when that text is not present.'''
+    try:
+        return item[key]['simpleText']
+    except (KeyError, TypeError):
+        return default
+
+
+def get_video_renderer_item_info(item):
+    '''Build the dict describing one video from a "videoRenderer" search item.
+
+    A video counts as upcoming when its thumbnail overlays mention UPCOMING and it
+    carries an upcomingEventData start time, and as live when the overlays mention
+    LIVE. For those, missing view count, length or publication text fall back to
+    placeholders. A regular video must provide all three.
+
+    Raises KeyError when a required entry is missing.'''
+    published = ""
+    views = ""
+    duration = "-"  # default, so upcoming videos that are not live still have a value
+    isLive = False
+    isUpcoming = False
+
+    overlays = str(item['thumbnailOverlays'])
+
+    if 'UPCOMING' in overlays and 'startTime' in item.get('upcomingEventData', {}):
+        isUpcoming = True
+        views = "-"
+        published = "Scheduled"
+
+    if 'LIVE' in overlays:
+        isLive = True
+        views = _renderer_simple_text(item, 'viewCountText', "Live")
+        duration = _renderer_simple_text(item, 'lengthText', "-")
+        if published != "Scheduled":
+            published = _renderer_simple_text(item, 'publishedTimeText', "None")
+
+    if not isUpcoming and not isLive:
+        views = item['viewCountText']['simpleText']
+        published = item['publishedTimeText']['simpleText']
+        duration = item['lengthText']['simpleText']
+
+    # A missing description snippet should not make the whole video disappear.
+    try:
+        description = get_description_snippet_text(item['descriptionSnippet']['runs'])
+    except KeyError:
+        description = ""
+
+    owner = item['ownerText']['runs'][0]
+    channel_id = owner['navigationEndpoint']['browseEndpoint']['browseId']
+
+    video = {
+        'videoTitle':item['title']['runs'][0]['text'],
+        'description':Markup(str(description)),
+        'views':views,
+        'timeStamp':published,
+        'duration':duration,
+        'channelName':owner['text'],
+        'authorUrl':"/channel/{}".format(channel_id),
+        'channelId':channel_id,
+        'id':item['videoId'],
+        'videoUrl':"/watch?v={}".format(item['videoId']),
+        'isLive':isLive,
+        'isUpcoming':isUpcoming,
+        'videoThumb':item['thumbnail']['thumbnails'][0]['url']
+    }
+    return video
+
--- a/youtube_data/videos.py
+++ b/youtube_data/videos.py
@ -11,7 +11,6 @@ def get_renderer_key(renderer, key):
            return k[key]

 def get_video_primary_info(datad, datai):
-
    contents = datai["contents"]["twoColumnWatchNextResults"]['results']['results']['contents']
    item = get_renderer_key(contents, "videoPrimaryInfoRenderer")
    details = datad['videoDetails']