print(response)

2020-10-12 12:39:19 +02:00 · 2020-10-12 12:39:19 +02:00 · 6fee62a491
commit 6fee62a491
parent 6c5ce51b26
3 changed files with 193 additions and 2 deletions
--- a/tw_data/feed.py
+++ b/tw_data/feed.py
@ -0,0 +1,75 @@
 from requests_futures.sessions import FuturesSession
 from werkzeug.datastructures import Headers
 from flask import Markup
 from concurrent.futures import as_completed
 from numerize import numerize
 from bs4 import BeautifulSoup
 from re import findall
 import time, datetime
 import requests
 import bleach
 import urllib
 import json
 import re
 # Base URL of the Nitter instance that gets scraped. User names and
 # site-relative paths are appended to it directly, so it has to end with a slash.
 NITTERINSTANCE = "https://nitter.net/"
 def get_feed(usernames, maxOld):
    '''
    Returns feed tweets given a set of usernames.

    The timeline page of every user is requested concurrently from
    NITTERINSTANCE and the tweets are scraped out of the returned HTML.

    usernames -- iterable of account names; each one is appended to
                 NITTERINSTANCE to build the page URL.
    maxOld    -- age limit in days; a tweet whose age in whole days is
                 greater than or equal to this value is skipped.

    Returns a list with one dict per tweet, in the order the HTTP responses
    complete (not the order of `usernames`).
    '''
    feedTweets = []
    with FuturesSession() as session:
        futures = [session.get('{instance}{user}'.format(instance=NITTERINSTANCE, user=u)) for u in usernames]
        for future in as_completed(futures):
            res = future.result().content.decode('utf-8')
            html = BeautifulSoup(res, "html.parser")
            userFeed = html.find_all('div', attrs={'class':'timeline-item'})
            # The last timeline item is always dropped.
            # NOTE(review): presumably it is the "show more" entry - confirm.
            # Slicing an empty result yields nothing, so no emptiness check is needed.
            for post in userFeed[:-1]:
                dateLink = post.find('span', attrs={'class':'tweet-date'}).find('a')
                date_time_str = dateLink['title'].replace(",","")
                # Parse once; the value is used for both the age filter and the result.
                timeStamp = datetime.datetime.strptime(date_time_str, '%d/%m/%Y %H:%M:%S')
                # Named `age` so the imported `time` module is not shadowed.
                age = datetime.datetime.now() - timeStamp
                if age.days >= maxOld:
                    continue

                pinned = post.find('div', attrs={'class':'pinned'})
                if pinned is not None and pinned.find('span', attrs={'class':'icon-pin'}) is not None:
                    continue

                tweet = {}
                tweet['originalPoster'] = post.find('a', attrs={'class':'username'}).text
                tweet['twitterName'] = post.find('a', attrs={'class':'fullname'}).text
                tweet['timeStamp'] = timeStamp
                tweet['date'] = dateLink.text
                tweet['content'] = Markup(post.find('div',  attrs={'class':'tweet-content'}))

                retweetHeader = post.find('div', attrs={'class':'retweet-header'})
                if retweetHeader is not None:
                    tweet['username'] = retweetHeader.find('div', attrs={'class':'icon-container'}).text
                    tweet['isRT'] = True
                else:
                    tweet['username'] = tweet['originalPoster']
                    tweet['isRT'] = False

                # [1:] strips the leading slash of the site-relative path.
                tweet['profilePic'] = NITTERINSTANCE + post.find('a', attrs={'class':'tweet-avatar'}).find('img')['src'][1:]
                # Stored on the tweet; it used to be assigned to a local and lost.
                tweet['url'] = NITTERINSTANCE + post.find('a', attrs={'class':'tweet-link'})['href'][1:]

                quote = post.find('div', attrs={'class':'quote'})
                if quote is not None:
                    tweet['isReply'] = True
                    tweet['quote'] = quote
                    quoteText = quote.find('div',  attrs={'class':'quote-text'})
                    if quoteText is not None:
                        tweet['replyingTweetContent'] = Markup(quoteText)
                    quoteImg = quote.find('a', attrs={'class':'still-image'})
                    if quoteImg is not None:
                        tweet['replyAttachedImg'] = NITTERINSTANCE + quoteImg['href'][1:]
                    quoteUser = quote.find('a',  attrs={'class':'username'})
                    if quote.find('div', attrs={'class':'unavailable-quote'}) is not None or quoteUser is None:
                        tweet['replyingUser'] = "Unavailable"
                    else:
                        tweet['replyingUser'] = quoteUser.text
                    # Detach the quote so the attachment lookup below only sees
                    # the tweet's own media. extract() is used instead of
                    # decompose() because tweet['quote'] still references the
                    # element and decompose() would destroy it.
                    quote.extract()

                attachments = post.find('div',  attrs={'class':'attachments'})
                if attachments is not None and post.find(class_='quote') is None:
                    if attachments.find('a', attrs={'class':'still-image'}) is not None:
                        # Stored on the tweet; it used to be assigned to a local and lost.
                        tweet['attachedImg'] = NITTERINSTANCE + attachments.find('a')['href'][1:]
                feedTweets.append(tweet)
    return feedTweets
--- a/tw_data/user.py
+++ b/tw_data/user.py
@ -0,0 +1,116 @@
 from flask import Markup
 from requests_futures.sessions import FuturesSession
 from werkzeug.datastructures import Headers
 from concurrent.futures import as_completed
 from numerize import numerize
 from bs4 import BeautifulSoup
 from re import findall
 import time, datetime
 import requests
 import bleach
 import urllib
 import json
 import re
 ##########################
 #### Config variables ####
 ##########################
 # Base URL of the Nitter instance that gets scraped. User names and
 # site-relative paths are appended to it directly, so it has to end with a slash.
 NITTERINSTANCE = 'https://nitter.net/'
 def get_uer_info(username):
    '''
    Scrapes the profile card of a user from their Nitter page.

    username -- account name; appended to NITTERINSTANCE to build the URL.

    Returns False when the page contains an error panel. Otherwise returns a
    dict with the keys profileFullName, profileUsername, profileBio, tweets,
    following, followers, likes and profilePic. profileFullName and
    profileBio are None when the page lacks the matching element.
    '''
    # A plain `import urllib` does not guarantee that the `request`
    # submodule is loaded, so import it explicitly.
    import urllib.request

    with urllib.request.urlopen('{instance}{user}'.format(instance=NITTERINSTANCE, user=username)) as page:
        response = page.read()
    # Decode the bytes once before parsing. Parsing str(response) (the
    # "b'...'" repr) forced every extracted text through a
    # latin1/unicode_escape round trip to undo the escaping.
    html = BeautifulSoup(response.decode('utf-8'), "lxml")
    if html.body.find('div', attrs={'class':'error-panel'}) is not None:
        return False

    card = html.body.find('div', attrs={'class':'profile-card'})
    fullNameTag = card.find('a', attrs={'class':'profile-card-fullname'})
    bioTag = card.find('div', attrs={'class':'profile-bio'})
    # The counters are read by position: tweets, following, followers, likes.
    stats = card.find_all('span', attrs={'class':'profile-stat-num'})
    user = {
        "profileFullName":fullNameTag.getText() if fullNameTag is not None else None,
        "profileUsername":card.find('a', attrs={'class':'profile-card-username'}).getText(),
        "profileBio":bioTag.getText() if bioTag is not None else None,
        "tweets":stats[0].string,
        "following":stats[1].string,
        # Thousands separators are removed before the count is abbreviated.
        "followers":numerize.numerize(int(stats[2].string.replace(",",""))),
        "likes":stats[3].string,
        # [1:] strips the leading slash of the site-relative path.
        "profilePic":"{instance}{pic}".format(instance=NITTERINSTANCE, pic=card.find('a', attrs={'class':'profile-card-avatar'})['href'][1:])
    }
    return user
 def get_tweets(user, page=1):
    '''
    Fetches a user's Nitter timeline and returns its tweets.

    user -- account name; appended to NITTERINSTANCE to build the URL.
    page -- 2 follows the "show more" link of the first page and returns
            the tweets of that second page; any other value returns the
            tweets of the first page.

    Returns whatever get_feed_tweets returns for the selected page, or
    {"emptyFeed": True} when page is 2 and the first page has no
    "show more" link.
    '''
    # A plain `import urllib` does not guarantee that the `request`
    # submodule is loaded, so import it explicitly.
    import urllib.request

    profileUrl = '{instance}{user}'.format(instance=NITTERINSTANCE, user=user)
    with urllib.request.urlopen(profileUrl) as resp:
        html = BeautifulSoup(resp.read().decode('utf-8'), "html.parser")

    if page == 2:
        # The first page is only needed for its pagination link, so its
        # tweets are not scraped.
        showMore = html.find('div', attrs={'class':'show-more'})
        nextLink = showMore.find('a') if showMore is not None else None
        if nextLink is None:
            # Without a pagination link there is no second page to read.
            return {"emptyFeed": True}
        with urllib.request.urlopen(profileUrl + nextLink['href']) as resp:
            html = BeautifulSoup(resp.read().decode('utf-8'), "html.parser")

    return get_feed_tweets(html)
 def get_feed_tweets(html):
    '''
    Extracts the tweets from a parsed Nitter timeline page.

    html -- parsed page; only its find_all method is used here.

    Returns {"emptyFeed": True} when the page has no timeline items at all,
    otherwise a list with one dict per tweet. Pinned tweets and items
    containing "show-more" are left out.
    '''
    userFeed = html.find_all('div', attrs={'class':'timeline-item'})
    if not userFeed:
        return {"emptyFeed": True}

    feedPosts = []
    # The last timeline item is always dropped.
    # NOTE(review): presumably it is a pagination entry - confirm.
    for post in userFeed[:-1]:
        if 'show-more' in str(post):
            continue
        dateLink = post.find('span', attrs={'class':'tweet-date'}).find('a')
        date_time_str = dateLink['title'].replace(",","")

        pinned = post.find('div', attrs={'class':'pinned'})
        if pinned is not None and pinned.find('span', attrs={'class':'icon-pin'}) is not None:
            continue

        tweet = {}
        tweet['op'] = post.find('a', attrs={'class':'username'}).text
        tweet['twitterName'] = post.find('a', attrs={'class':'fullname'}).text
        tweet['timeStamp'] = str(datetime.datetime.strptime(date_time_str, '%d/%m/%Y %H:%M:%S'))
        tweet['date'] = dateLink.text
        tweet['content'] = Markup(post.find('div',  attrs={'class':'tweet-content'}).decode_contents())

        retweetHeader = post.find('div', attrs={'class':'retweet-header'})
        if retweetHeader is not None:
            tweet['username'] = retweetHeader.find('div', attrs={'class':'icon-container'}).text
            tweet['isRT'] = True
        else:
            tweet['username'] = tweet['op']
            tweet['isRT'] = False

        # [1:] strips the leading slash of the site-relative path.
        tweet['profilePic'] = NITTERINSTANCE + post.find('a', attrs={'class':'tweet-avatar'}).find('img')['src'][1:]
        tweet['url'] = NITTERINSTANCE + post.find('a', attrs={'class':'tweet-link'})['href'][1:]

        quote = post.find('div', attrs={'class':'quote'})
        if quote is not None:
            tweet['isReply'] = True
            quoteText = quote.find('div',  attrs={'class':'quote-text'})
            if quoteText is not None:
                tweet['replyingTweetContent'] = Markup(quoteText)
            quoteImg = quote.find('a', attrs={'class':'still-image'})
            if quoteImg is not None:
                tweet['replyAttachedImg'] = NITTERINSTANCE + quoteImg['href'][1:]
            # An unavailable quote carries no username link; reading .text
            # from the missing element used to raise AttributeError.
            quoteUser = quote.find('a',  attrs={'class':'username'})
            if quote.find('div', attrs={'class':'unavailable-quote'}) is not None or quoteUser is None:
                tweet['replyingUser'] = "Unavailable"
            else:
                tweet['replyingUser'] = quoteUser.text
            # Remove the quote so the attachment lookup below only sees the
            # tweet's own media.
            quote.decompose()

        attachments = post.find('div',  attrs={'class':'attachments'})
        if attachments is not None and post.find(class_='quote') is None:
            if attachments.find('a', attrs={'class':'still-image'}) is not None:
                tweet['attachedImg'] = NITTERINSTANCE + attachments.find('a')['href'][1:]
        feedPosts.append(tweet)
    return feedPosts
--- a/youtube/util.py
+++ b/youtube/util.py
@ -147,7 +147,7 @@ def bypass_captcha(session, response, url, cookies):
        inputs['g-recaptcha-response'] = response['solution']['gRecaptchaResponse']
-        print(response['solution'])
+        print(response)
        # Print POST request headers
        print(requests.post("https://youtube.com/das_captcha", data=inputs,
                            headers={"Content-Type": "application/x-www-form-urlencoded",
@ -204,7 +204,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
        session = requests.Session()
        print("Starting python GET request to "+url+"...")
-        response = session.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0'})
+        response = session.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0', "Accept-Language": "en-US,en;q=0.5"})
        # Strings that appear when there's a Captcha.
        string_de = "Fülle das folgende Feld aus, um YouTube weiter zu nutzen."