from requests_futures.sessions import FuturesSession
from werkzeug.datastructures import Headers
from flask import Markup
from concurrent.futures import as_completed
from numerize import numerize
from bs4 import BeautifulSoup
from re import findall
import time
import datetime
import requests
import bleach
import urllib
import json
import re

NITTERINSTANCE = "https://nitter.net/"


def get_feed(usernames, maxOld):
    '''
    Returns feed tweets given a set of usernames.

    Every profile page is requested concurrently from NITTERINSTANCE and
    parsed as soon as its response arrives, so the returned list follows
    response-completion order, not the order of `usernames`.

    usernames -- iterable of profile names appended to NITTERINSTANCE.
    maxOld    -- posts whose age in whole days is >= maxOld are skipped.

    Returns a list of tweet dicts (see _parse_feed_post for the keys).
    Any exception raised by a request propagates to the caller.
    '''
    feedTweets = []
    with FuturesSession() as session:
        futures = [
            session.get('{instance}{user}'.format(instance=NITTERINSTANCE, user=u))
            for u in usernames
        ]
        for future in as_completed(futures):
            res = future.result().content.decode('utf-8')
            html = BeautifulSoup(res, "html.parser")
            userFeed = html.find_all('div', attrs={'class': 'timeline-item'})
            # The last timeline item is always left out, as before.
            # NOTE(review): presumably it is the "load more" entry - confirm.
            for post in userFeed[:-1]:
                tweet = _parse_feed_post(post, maxOld)
                if tweet is not None:
                    feedTweets.append(tweet)
    return feedTweets


def _parse_feed_post(post, maxOld):
    '''
    Builds the tweet dict for one timeline item.

    Returns None when the item has to be skipped: it carries no date, it is
    maxOld days old or older, or it is a pinned post.

    Keys always present: originalPoster, twitterName, timeStamp (datetime),
    date, content, username, isRT, profilePic, url.
    Keys present only when applicable: isReply, quote, replyingTweetContent,
    replyAttachedImg, replyingUser, attachedImg.
    '''
    date_span = post.find('span', attrs={'class': 'tweet-date'})
    if date_span is None:
        # Not a regular tweet; without a date it cannot be aged or shown.
        return None
    date_anchor = date_span.find('a')
    # Parse once and reuse for both the age filter and the stored timestamp.
    timestamp = datetime.datetime.strptime(
        date_anchor['title'].replace(",", ""), '%d/%m/%Y %H:%M:%S')
    # Named `age` so the imported `time` module is not shadowed.
    age = datetime.datetime.now() - timestamp
    if age.days >= maxOld:
        return None

    pinned = post.find('div', attrs={'class': 'pinned'})
    # The class filter is spelled as a dict; it used to be a bare set literal.
    if pinned is not None and pinned.find('span', attrs={'class': 'icon-pin'}):
        return None

    tweet = {}
    tweet['originalPoster'] = post.find('a', attrs={'class': 'username'}).text
    tweet['twitterName'] = post.find('a', attrs={'class': 'fullname'}).text
    tweet['timeStamp'] = timestamp
    tweet['date'] = date_anchor.text
    # NOTE(review): Markup() marks remote HTML as safe without sanitizing it.
    tweet['content'] = Markup(post.find('div', attrs={'class': 'tweet-content'}))

    retweet_header = post.find('div', attrs={'class': 'retweet-header'})
    if retweet_header is not None:
        tweet['username'] = retweet_header.find('div', attrs={'class': 'icon-container'}).text
        tweet['isRT'] = True
    else:
        tweet['username'] = tweet['originalPoster']
        tweet['isRT'] = False

    # [1:] drops the leading "/" because NITTERINSTANCE already ends with one.
    tweet['profilePic'] = NITTERINSTANCE + post.find('a', attrs={'class': 'tweet-avatar'}).find('img')['src'][1:]
    # Stored on the tweet now; it used to be computed and thrown away.
    tweet['url'] = NITTERINSTANCE + post.find('a', attrs={'class': 'tweet-link'})['href'][1:]

    quote = post.find('div', attrs={'class': 'quote'})
    if quote is not None:
        tweet['isReply'] = True
        tweet['quote'] = quote
        quote_text = quote.find('div', attrs={'class': 'quote-text'})
        if quote_text is not None:
            tweet['replyingTweetContent'] = Markup(quote_text)

        quote_image = quote.find('a', attrs={'class': 'still-image'})
        if quote_image is not None:
            tweet['replyAttachedImg'] = NITTERINSTANCE + quote_image['href'][1:]

        if quote.find('div', attrs={'class': 'unavailable-quote'}) is not None:
            tweet['replyingUser'] = "Unavailable"
        else:
            tweet['replyingUser'] = quote.find('a', attrs={'class': 'username'}).text
        # extract() detaches the quote from the post but keeps the tag usable;
        # decompose() would destroy the object just stored in tweet['quote'].
        quote.extract()

    attachments = post.find('div', attrs={'class': 'attachments'})
    if attachments is not None and post.find(class_='quote') is None:
        if attachments.find('a', attrs={'class': 'still-image'}):
            # Stored on the tweet now; it used to be a discarded local.
            tweet['attachedImg'] = NITTERINSTANCE + attachments.find('a')['href'][1:]
    return tweet
##########################
#### Config variables ####
##########################
NITTERINSTANCE = 'https://nitter.net/'


def _fetch_page(url):
    '''
    Downloads `url` and returns the response body decoded as UTF-8 text.

    Errors raised by urlopen (e.g. urllib.error.HTTPError) propagate.
    '''
    # A bare `import urllib` does not load the `request` submodule, so it is
    # imported explicitly here instead of relying on another module doing it.
    import urllib.request
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def get_uer_info(username):
    '''
    Returns the profile information shown on a user's Nitter page.

    username -- profile name appended to NITTERINSTANCE.

    Returns False when the page contains an error panel or no profile card,
    otherwise a dict with the keys profileFullName, profileUsername,
    profileBio, tweets, following, followers, likes and profilePic.
    profileFullName and profileBio are None when the page omits them.
    '''
    page = _fetch_page('{instance}{user}'.format(instance=NITTERINSTANCE, user=username))
    # The page is decoded before parsing. Parsing str(bytes) and un-escaping
    # each field through latin1/unicode_escape broke on any parsed character
    # outside Latin-1.
    html = BeautifulSoup(page, "lxml")
    if html.body.find('div', attrs={'class': 'error-panel'}):
        return False

    card = html.body.find('div', attrs={'class': 'profile-card'})
    if card is None:
        # No profile to read; report it the same way as an error page.
        return False

    name_link = card.find('a', attrs={'class': 'profile-card-fullname'})
    fullName = name_link.getText() if name_link is not None else None

    bio = card.find('div', attrs={'class': 'profile-bio'})
    profileBio = bio.getText() if bio is not None else None

    # Looked up once; the values appear in the order
    # tweets, following, followers, likes.
    stats = card.find_all('span', attrs={'class': 'profile-stat-num'})

    user = {
        "profileFullName": fullName,
        "profileUsername": card.find('a', attrs={'class': 'profile-card-username'}).getText(),
        "profileBio": profileBio,
        "tweets": stats[0].string,
        "following": stats[1].string,
        # Thousands separators are removed so the count parses as an int.
        "followers": numerize.numerize(int(stats[2].string.replace(",", ""))),
        "likes": stats[3].string,
        # [1:] drops the leading "/" because NITTERINSTANCE already ends with one.
        "profilePic": "{instance}{pic}".format(
            instance=NITTERINSTANCE,
            pic=card.find('a', attrs={'class': 'profile-card-avatar'})['href'][1:]),
    }
    return user


# Correctly spelled alias; the original name stays available for callers.
get_user_info = get_uer_info


def get_tweets(user, page=1):
    '''
    Returns the tweets of `user` for the requested page.

    page -- 2 follows the "show-more" link of the first page; any other
            value returns the first page.
            NOTE(review): pages beyond 2 are not supported and fall back to
            the first page - confirm this is intended.

    Returns what get_feed_tweets returns for the selected page, or
    {"emptyFeed": True} when page 2 is requested but the first page has no
    "show-more" link.
    '''
    first_url = '{instance}{user}'.format(instance=NITTERINSTANCE, user=user)
    html = BeautifulSoup(_fetch_page(first_url), "html.parser")
    if page != 2:
        return get_feed_tweets(html)

    # The first page is only needed for its "show-more" link here, so its
    # tweets are not parsed.
    show_more = html.find('div', attrs={'class': 'show-more'})
    if show_more is None:
        return {"emptyFeed": True}
    nextPage = show_more.find('a')['href']
    next_url = '{instance}{user}{page}'.format(instance=NITTERINSTANCE, user=user, page=nextPage)
    print(next_url)
    return get_feed_tweets(BeautifulSoup(_fetch_page(next_url), "html.parser"))


def get_feed_tweets(html):
    '''
    Extracts the tweets found in a parsed Nitter timeline page.

    html -- BeautifulSoup document (or tag) containing "timeline-item" divs.

    Returns {"emptyFeed": True} when the page has no timeline items,
    otherwise a list of tweet dicts. Keys always present: op, twitterName,
    timeStamp (string), date, content, username, isRT, profilePic, url.
    Keys present only when applicable: isReply, replyingTweetContent,
    replyAttachedImg, replyingUser, attachedImg.

    Quoted tweets are removed from `html` while it is processed.
    '''
    userFeed = html.find_all('div', attrs={'class': 'timeline-item'})
    if not userFeed:
        return {"emptyFeed": True}

    feedPosts = []
    # The last timeline item is always left out, as before.
    # NOTE(review): presumably it is the "load more" entry - confirm.
    for post in userFeed[:-1]:
        if 'show-more' in str(post):
            continue
        date_span = post.find('span', attrs={'class': 'tweet-date'})
        if date_span is None:
            # Not a regular tweet; nothing to build a timestamp from.
            continue
        date_anchor = date_span.find('a')
        date_time_str = date_anchor['title'].replace(",", "")

        pinned = post.find('div', attrs={'class': 'pinned'})
        # The class filter is spelled as a dict; it used to be a bare set literal.
        if pinned is not None and pinned.find('span', attrs={'class': 'icon-pin'}):
            continue

        tweet = {}
        tweet['op'] = post.find('a', attrs={'class': 'username'}).text
        tweet['twitterName'] = post.find('a', attrs={'class': 'fullname'}).text
        tweet['timeStamp'] = str(datetime.datetime.strptime(date_time_str, '%d/%m/%Y %H:%M:%S'))
        tweet['date'] = date_anchor.text
        # NOTE(review): Markup() marks remote HTML as safe without sanitizing it.
        tweet['content'] = Markup(post.find('div', attrs={'class': 'tweet-content'}).decode_contents())

        retweet_header = post.find('div', attrs={'class': 'retweet-header'})
        if retweet_header is not None:
            tweet['username'] = retweet_header.find('div', attrs={'class': 'icon-container'}).text
            tweet['isRT'] = True
        else:
            tweet['username'] = tweet['op']
            tweet['isRT'] = False

        # [1:] drops the leading "/" because NITTERINSTANCE already ends with one.
        tweet['profilePic'] = NITTERINSTANCE + post.find('a', attrs={'class': 'tweet-avatar'}).find('img')['src'][1:]
        tweet['url'] = NITTERINSTANCE + post.find('a', attrs={'class': 'tweet-link'})['href'][1:]

        quote = post.find('div', attrs={'class': 'quote'})
        if quote is not None:
            tweet['isReply'] = True
            quote_text = quote.find('div', attrs={'class': 'quote-text'})
            if quote_text is not None:
                tweet['replyingTweetContent'] = Markup(quote_text)

            quote_image = quote.find('a', attrs={'class': 'still-image'})
            if quote_image is not None:
                tweet['replyAttachedImg'] = NITTERINSTANCE + quote_image['href'][1:]

            # A quote without a username link (e.g. an unavailable quote)
            # used to raise AttributeError; label it "Unavailable" instead.
            quoted_user = quote.find('a', attrs={'class': 'username'})
            if quote.find('div', attrs={'class': 'unavailable-quote'}) is not None or quoted_user is None:
                tweet['replyingUser'] = "Unavailable"
            else:
                tweet['replyingUser'] = quoted_user.text
            # Remove the quote so the attachment lookup below only sees the
            # post's own media.
            quote.decompose()

        attachments = post.find('div', attrs={'class': 'attachments'})
        if attachments is not None and post.find(class_='quote') is None:
            if attachments.find('a', attrs={'class': 'still-image'}):
                tweet['attachedImg'] = NITTERINSTANCE + attachments.find('a')['href'][1:]
        feedPosts.append(tweet)
    return feedPosts