print(response)

2020-10-12 12:39:19 +02:00 · 2020-10-12 12:39:19 +02:00 · 6fee62a491
commit 6fee62a491
parent 6c5ce51b26
3 changed files with 193 additions and 2 deletions
--- a/tw_data/feed.py
+++ b/tw_data/feed.py
@ -0,0 +1,75 @@
+from requests_futures.sessions import FuturesSession
+from werkzeug.datastructures import Headers
+from flask import Markup
+from concurrent.futures import as_completed
+from numerize import numerize
+from bs4 import BeautifulSoup
+from re import findall
+import time, datetime
+import requests
+import bleach
+import urllib
+import json
+import re
+
+# Base URL of the Nitter instance that is scraped. It ends with "/" because
+# user names and site-relative paths are appended to it directly.
+NITTERINSTANCE = "https://nitter.net/"
+
+def get_feed(usernames, maxOld):
+    '''
+    Returns feed tweets given a set of usernames.
+
+    One GET request per username is sent concurrently to NITTERINSTANCE and
+    every returned timeline page is scraped. Tweets are appended in the order
+    the responses complete, so the result is not sorted by date.
+
+    usernames -- iterable of account names appended to NITTERINSTANCE.
+    maxOld    -- tweets whose age in whole days is >= this value are skipped.
+
+    Returns a list of dicts with the keys 'originalPoster', 'twitterName',
+    'timeStamp' (datetime), 'date', 'content', 'username', 'isRT',
+    'profilePic' and 'url'. Tweets with a quote additionally carry 'isReply',
+    'quote', 'replyingUser' and, when present, 'replyingTweetContent' and
+    'replyAttachedImg'. Tweets with a still image carry 'attachedImg'.
+    '''
+    date_format = '%d/%m/%Y %H:%M:%S'
+    feedTweets = []
+    with FuturesSession() as session:
+        futures = [session.get('{instance}{user}'.format(instance=NITTERINSTANCE, user=u)) for u in usernames]
+        for future in as_completed(futures):
+            res = future.result().content.decode('utf-8')
+            html = BeautifulSoup(res, "html.parser")
+            userFeed = html.find_all('div', attrs={'class':'timeline-item'})
+            # The last timeline item is always left out.
+            # NOTE(review): presumably that is the "show more" entry - confirm.
+            for post in userFeed[:-1]:
+                date_link = post.find('span', attrs={'class':'tweet-date'}).find('a')
+                date_time_str = date_link['title'].replace(",","")
+                # Parse once; the value is used for the age check and stored.
+                timestamp = datetime.datetime.strptime(date_time_str, date_format)
+                # Named `age` so the imported `time` module is not shadowed.
+                age = datetime.datetime.now() - timestamp
+                if age.days >= maxOld:
+                    continue
+
+                # Pinned tweets are not part of the feed.
+                pinned = post.find('div', attrs={'class':'pinned'})
+                if pinned and pinned.find('span', attrs={'icon-pin'}):
+                    continue
+
+                tweet = {}
+                tweet['originalPoster'] = post.find('a', attrs={'class':'username'}).text
+                tweet['twitterName'] = post.find('a', attrs={'class':'fullname'}).text
+                tweet['timeStamp'] = timestamp
+                tweet['date'] = date_link.text
+                tweet['content'] = Markup(post.find('div',  attrs={'class':'tweet-content'}))
+
+                retweet_header = post.find('div', attrs={'class':'retweet-header'})
+                if retweet_header:
+                    tweet['username'] = retweet_header.find('div', attrs={'class':'icon-container'}).text
+                    tweet['isRT'] = True
+                else:
+                    tweet['username'] = tweet['originalPoster']
+                    tweet['isRT'] = False
+
+                # [1:] drops the leading "/" of the site-relative path because
+                # NITTERINSTANCE already ends with one.
+                tweet['profilePic'] = NITTERINSTANCE+post.find('a', attrs={'class':'tweet-avatar'}).find('img')['src'][1:]
+                # Fix: the link used to be assigned to a local variable and
+                # was therefore never returned to the caller.
+                tweet['url'] = NITTERINSTANCE + post.find('a', attrs={'class':'tweet-link'})['href'][1:]
+
+                quote = post.find('div', attrs={'class':'quote'})
+                if quote:
+                    tweet['isReply'] = True
+                    # NOTE(review): this element is decomposed a few lines
+                    # below, so the stored object is no longer usable as
+                    # markup. The key is kept only so that existing
+                    # `tweet['quote']` lookups keep working - confirm whether
+                    # any caller still needs it.
+                    tweet['quote'] = quote
+                    quote_text = quote.find('div',  attrs={'class':'quote-text'})
+                    if quote_text:
+                        tweet['replyingTweetContent'] = Markup(quote_text)
+
+                    quote_img = quote.find('a', attrs={'class':'still-image'})
+                    if quote_img:
+                        tweet['replyAttachedImg'] = NITTERINSTANCE+quote_img['href'][1:]
+
+                    if quote.find('div', attrs={'class':'unavailable-quote'}):
+                        tweet['replyingUser']="Unavailable"
+                    else:
+                        tweet['replyingUser']=quote.find('a',  attrs={'class':'username'}).text
+                    # Remove the quote from the tree so the attachment lookup
+                    # below does not find elements inside it.
+                    quote.decompose()
+
+                attachments = post.find('div',  attrs={'class':'attachments'})
+                if attachments and not post.find(class_='quote'):
+                    if attachments.find('a', attrs={'class':'still-image'}):
+                        # Fix: the image link used to be assigned to a local
+                        # variable and was therefore never returned.
+                        tweet['attachedImg'] = NITTERINSTANCE + attachments.find('a')['href'][1:]
+                feedTweets.append(tweet)
+    return feedTweets
--- a/tw_data/user.py
+++ b/tw_data/user.py
@ -0,0 +1,116 @@
+from flask import Markup
+from requests_futures.sessions import FuturesSession
+from werkzeug.datastructures import Headers
+from concurrent.futures import as_completed
+from numerize import numerize
+from bs4 import BeautifulSoup
+from re import findall
+import time, datetime
+import requests
+import bleach
+import urllib
+import json
+import re
+
+##########################
+#### Config variables ####
+##########################
+# Base URL of the Nitter instance that is scraped. It ends with "/" because
+# user names and site-relative paths are appended to it directly.
+NITTERINSTANCE = 'https://nitter.net/'
+
+def get_uer_info(username):
+    '''
+    Scrapes the profile card of `username` from NITTERINSTANCE.
+
+    username -- account name appended to NITTERINSTANCE.
+
+    Returns False when the page contains an "error-panel" element. Otherwise
+    returns a dict with the keys 'profileFullName' and 'profileBio' (None when
+    the element is missing), 'profileUsername', 'tweets', 'following',
+    'followers' (abbreviated with numerize), 'likes' and 'profilePic'.
+
+    The misspelled function name is kept because callers use it.
+    '''
+    # Import the submodule explicitly: a plain `import urllib` does not
+    # guarantee that `urllib.request` is loaded.
+    import urllib.request
+
+    url = '{instance}{user}'.format(instance=NITTERINSTANCE, user=username)
+    with urllib.request.urlopen(url) as connection:
+        response = connection.read()
+
+    # Fix: decode the bytes instead of parsing their repr (`str(response)`).
+    # The old code had to undo the repr escaping with a latin1/unicode_escape
+    # round-trip, which raises on text that cannot be encoded as latin1.
+    html = BeautifulSoup(response.decode('utf-8'), "lxml")
+    if html.body.find('div', attrs={'class':'error-panel'}):
+        return False
+
+    card = html.body.find('div', attrs={'class':'profile-card'})
+
+    fullname_link = card.find('a', attrs={'class':'profile-card-fullname'})
+    fullName = fullname_link.getText() if fullname_link else None
+
+    bio = card.find('div', attrs={'class':'profile-bio'})
+    profileBio = bio.getText() if bio else None
+
+    # Looked up once; the order of the entries is relied upon below.
+    stats = card.find_all('span', attrs={'class':'profile-stat-num'})
+
+    user = {
+        "profileFullName":fullName,
+        "profileUsername":card.find('a', attrs={'class':'profile-card-username'}).getText(),
+        "profileBio":profileBio,
+        "tweets":stats[0].string,
+        "following":stats[1].string,
+        # Thousands separators are removed before the number is abbreviated.
+        "followers":numerize.numerize(int(stats[2].string.replace(",",""))),
+        "likes":stats[3].string,
+        # [1:] drops the leading "/" because NITTERINSTANCE ends with one.
+        "profilePic":"{instance}{pic}".format(instance=NITTERINSTANCE, pic=card.find('a', attrs={'class':'profile-card-avatar'})['href'][1:])
+    }
+    return user
+
+def _fetch_page(url):
+    '''Downloads `url` and returns it parsed with the html.parser backend.'''
+    # Import the submodule explicitly: a plain `import urllib` does not
+    # guarantee that `urllib.request` is loaded.
+    import urllib.request
+
+    with urllib.request.urlopen(url) as connection:
+        return BeautifulSoup(connection.read().decode('utf-8'), "html.parser")
+
+def get_tweets(user, page=1):
+    '''
+    Returns the tweets of one timeline page of `user`.
+
+    user -- account name appended to NITTERINSTANCE.
+    page -- 2 follows the "show-more" link of the first page once; any other
+            value returns the first page.
+
+    Returns whatever get_feed_tweets() returns for the selected page. When
+    page is 2 and the first page has no "show-more" link, the empty-feed
+    marker {"emptyFeed": True} is returned instead of raising.
+    '''
+    html = _fetch_page('{instance}{user}'.format(instance=NITTERINSTANCE, user=user))
+
+    if page == 2:
+        # The first page is only needed for its link to the next page, so its
+        # tweets are no longer extracted and thrown away.
+        show_more = html.find('div', attrs={'class':'show-more'})
+        next_link = show_more.find('a') if show_more else None
+        if next_link is None:
+            # NOTE(review): same marker get_feed_tweets() uses for a page
+            # without timeline items - confirm that callers expect it here.
+            return {"emptyFeed": True}
+        next_url = '{instance}{user}{page}'.format(instance=NITTERINSTANCE, user=user, page=next_link['href'])
+        print(next_url)
+        html = _fetch_page(next_url)
+    return get_feed_tweets(html)
+
+def get_feed_tweets(html):
+    '''
+    Extracts the tweets from a parsed timeline page.
+
+    html -- parsed page; it is searched for 'timeline-item' divs.
+            NOTE(review): quote elements are removed from this tree while it
+            is processed, so the caller's object is modified.
+
+    Returns {"emptyFeed": True} when the page has no timeline items, otherwise
+    a list of dicts with the keys 'op', 'twitterName', 'timeStamp' (string),
+    'date', 'content', 'username', 'isRT', 'profilePic' and 'url'. Tweets with
+    a quote additionally carry 'isReply', 'replyingUser' and, when present,
+    'replyingTweetContent' and 'replyAttachedImg'. Tweets with a still image
+    carry 'attachedImg'.
+    '''
+    userFeed = html.find_all('div', attrs={'class':'timeline-item'})
+    if userFeed == []:
+        return {"emptyFeed": True}
+
+    feedPosts = []
+    # The last timeline item is always left out.
+    # NOTE(review): presumably that is the "show more" entry - confirm.
+    for post in userFeed[:-1]:
+        if 'show-more' in str(post):
+            continue
+        date_link = post.find('span', attrs={'class':'tweet-date'}).find('a')
+        date_time_str = date_link['title'].replace(",","")
+
+        # Pinned tweets are not part of the feed.
+        pinned = post.find('div', attrs={'class':'pinned'})
+        if pinned and pinned.find('span', attrs={'icon-pin'}):
+            continue
+
+        tweet = {}
+        tweet['op'] = post.find('a', attrs={'class':'username'}).text
+        tweet['twitterName'] = post.find('a', attrs={'class':'fullname'}).text
+        tweet['timeStamp'] = str(datetime.datetime.strptime(date_time_str, '%d/%m/%Y %H:%M:%S'))
+        tweet['date'] = date_link.text
+        tweet['content'] = Markup(post.find('div',  attrs={'class':'tweet-content'}).decode_contents())
+
+        retweet_header = post.find('div', attrs={'class':'retweet-header'})
+        if retweet_header:
+            tweet['username'] = retweet_header.find('div', attrs={'class':'icon-container'}).text
+            tweet['isRT'] = True
+        else:
+            tweet['username'] = tweet['op']
+            tweet['isRT'] = False
+
+        # [1:] drops the leading "/" of the site-relative path because
+        # NITTERINSTANCE already ends with one.
+        tweet['profilePic'] = NITTERINSTANCE+post.find('a', attrs={'class':'tweet-avatar'}).find('img')['src'][1:]
+        tweet['url'] = NITTERINSTANCE + post.find('a', attrs={'class':'tweet-link'})['href'][1:]
+
+        quote = post.find('div', attrs={'class':'quote'})
+        if quote:
+            tweet['isReply'] = True
+            quote_text = quote.find('div',  attrs={'class':'quote-text'})
+            if quote_text:
+                tweet['replyingTweetContent'] = Markup(quote_text)
+
+            quote_img = quote.find('a', attrs={'class':'still-image'})
+            if quote_img:
+                tweet['replyAttachedImg'] = NITTERINSTANCE+quote_img['href'][1:]
+
+            # Fix: a quote without a username link used to raise
+            # AttributeError. Such quotes are reported as "Unavailable".
+            quoted_user = quote.find('a',  attrs={'class':'username'})
+            if quoted_user is None or quote.find('div', attrs={'class':'unavailable-quote'}):
+                tweet['replyingUser']="Unavailable"
+            else:
+                tweet['replyingUser']=quoted_user.text
+            # Remove the quote from the tree so the attachment lookup below
+            # does not find elements inside it.
+            quote.decompose()
+
+        attachments = post.find('div',  attrs={'class':'attachments'})
+        if attachments and not post.find(class_='quote'):
+            if attachments.find('a', attrs={'class':'still-image'}):
+                tweet['attachedImg'] = NITTERINSTANCE + attachments.find('a')['href'][1:]
+        feedPosts.append(tweet)
+    return feedPosts
--- a/youtube/util.py
+++ b/youtube/util.py
@ -147,7 +147,7 @@ def bypass_captcha(session, response, url, cookies):


        inputs['g-recaptcha-response'] = response['solution']['gRecaptchaResponse']
-        print(response['solution'])
+        print(response)
        # Print POST request headers
        print(requests.post("https://youtube.com/das_captcha", data=inputs,
                            headers={"Content-Type": "application/x-www-form-urlencoded",
@ -204,7 +204,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,

        session = requests.Session()
        print("Starting python GET request to "+url+"...")
-        response = session.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0'})
+        response = session.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0', "Accept-Language": "en-US,en;q=0.5"})

        # Strings that appear when there's a Captcha.
        string_de = "Fülle das folgende Feld aus, um YouTube weiter zu nutzen."