Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
print('Loading Libraries')
# Standard Libraries
import pandas as pd
import numpy as np
from datetime import datetime
# URL Parser (used to extract the domain from post links)
from urllib.parse import urlparse
# Reddit API (third-party: praw)
import praw
# Sentiment and NLP: TextBlob (title polarity/subjectivity, noun phrases)
from textblob import TextBlob
# Newspaper3k: article download/parse for linked URLs
from newspaper import Article
# Subreddit Scraper Function
# from Scraper_Library import subreddit_title_scraper
print('Completed')
print('Loading Reddit Params')
# Parse 'key = value' lines from the params file into a flat dict.
# Lines that do not split into exactly two '='-separated parts are skipped,
# matching the original behavior.
reddit_params = {}
# 'with' guarantees the file handle is closed; the original opened the file
# and never closed it.
with open('Scraper_Params.dat', mode='r') as fileObj:
    for line in fileObj:
        key_value = line.strip().split('=')
        if len(key_value) == 2:
            reddit_params[key_value[0].strip()] = key_value[1].strip()
print('Complete')
print('Assigning Variables')

def _client(prefix):
    # Build a praw.Reddit client from the '<prefix>_*' entries in reddit_params.
    return praw.Reddit(
        client_id=reddit_params[prefix + '_client_id'],
        client_secret=reddit_params[prefix + '_client_secret'],
        password=reddit_params[prefix + '_password'],
        user_agent=reddit_params[prefix + '_user_agent'],
        username=reddit_params[prefix + '_username'],
    )

# Two authenticated clients, one per account defined in the params file.
red = _client('red')
blu = _client('blu')

# Comma-separated subreddit lists for each group.
red_sub_list = reddit_params['red_list'].strip().split(', ')
print(red_sub_list)
blu_sub_list = reddit_params['blu_list'].strip().split(', ')

# Per-subreddit post cap; only the first value before any ', ' is used.
sub_limit = int(reddit_params['limit_per_sub'].strip().split(', ')[0])
print('Complete')
def subreddit_title_scraper(sub_list, api, limit, df=True):
    """Scrape hot (non-stickied) posts from the given subreddits.

    For each post the function records the title, its TextBlob sentiment
    (polarity / subjectivity) and noun phrases, the link's domain, and —
    when the link downloads and parses as a valid article via newspaper3k —
    the article title and authors.

    Parameters
    ----------
    sub_list : list[str]
        Subreddit names. Compared (by equality) against the module-level
        ``red_sub_list`` / ``blu_sub_list`` to fill the 'target' column
        (True for red, False for blu, NaN otherwise).
    api : praw.Reddit
        Authenticated Reddit client used for the requests.
    limit : int
        Maximum number of hot posts to pull per subreddit.
    df : bool, default True
        Return a pandas DataFrame when True, the raw dict of lists otherwise.

    Returns
    -------
    pandas.DataFrame or dict
    """
    # Decide the label once, before the loops. The original compared with
    # 'is' for red and '==' for blu and appended NOTHING when neither
    # matched, which desynchronized the column lengths and would crash the
    # DataFrame constructor. An unrecognized sub_list now labels as NaN.
    if sub_list == red_sub_list:
        target = True
    elif sub_list == blu_sub_list:
        target = False
    else:
        target = np.nan

    posts_dict = {"post title": [], "subreddit": [], "is article": [],
                  "article title": [], "title polarity": [],
                  "title objectivity": [], "keywords": [], "domain": [],
                  "link": [], "author": [], "date": [], "target": []}
    article_count = 0
    invalid_links = 0

    def _mark_non_article():
        # Keep every column the same length when a link yields no article.
        posts_dict["is article"].append(False)
        posts_dict["article title"].append(np.nan)
        posts_dict["author"].append(np.nan)

    for sub in sub_list:
        submissions = (x for x in api.subreddit(sub).hot(limit=limit)
                       if not x.stickied)
        for post in submissions:
            posts_dict["target"].append(target)
            posts_dict["post title"].append(post.title)
            posts_dict["link"].append(post.url)
            posts_dict["subreddit"].append(sub)
            posts_dict["date"].append(datetime.fromtimestamp(post.created_utc))
            posts_dict["domain"].append(urlparse(post.url).netloc)
            post_blob = TextBlob(post.title)
            posts_dict["title polarity"].append(post_blob.sentiment[0])
            posts_dict["title objectivity"].append(post_blob.sentiment[1])
            posts_dict["keywords"].append(post_blob.noun_phrases)

            article = Article(post.url)
            # BUG FIX: is_valid_url is a method; the original tested the
            # bound method object itself, which is always truthy, so the
            # invalid-URL branch could never run (and would have left the
            # columns mismatched if it had). Call it, and record the post
            # as a non-article when the URL is not parseable.
            if not article.is_valid_url():
                invalid_links += 1
                _mark_non_article()
                continue
            try:
                article.download()
                article.parse()
            except Exception:
                # Narrowed from a bare 'except:'; download/parse failures
                # (network errors, unparseable pages) mark a non-article.
                _mark_non_article()
                continue
            if article.is_valid_body():
                article_count += 1
                posts_dict["is article"].append(True)
                posts_dict["article title"].append(article.title)
                # newspaper3k returns [] when no authors were found.
                posts_dict["author"].append(
                    article.authors if article.authors else np.nan)
                if article_count % 5 == 0:
                    print(f"Added {article_count} articles")
            else:
                invalid_links += 1
                _mark_non_article()
                if invalid_links % 5 == 0:
                    print(f"{invalid_links} invalid links skipped")

    if df:
        print(f"creating data frame from {article_count + invalid_links} links")
        posts_df = pd.DataFrame(posts_dict)  ## Make it a dataframe ##
        posts_df = posts_df[["subreddit", "post title", "keywords",
                             "title polarity", "title objectivity",
                             "domain", "is article", "article title",
                             "link", "author", "date", "target"]]
        print(f"Done processing {article_count} articles and {invalid_links} non-articles as dataframe")
        return posts_df
    print(f"Done processing {article_count} articles and {invalid_links} non-articles as dictionary")
    return posts_dict
# Pull posts from both subreddit groups and build one DataFrame per group.
print(f"Pulling {sub_limit} posts from {str(blu_sub_list)} and {str(red_sub_list)}")
# NOTE(review): the names look swapped — dfb is built from red_sub_list/red
# and dfr from blu_sub_list/blu. Confirm which frame is meant to hold which
# group before relying on the names downstream.
dfb = subreddit_title_scraper(red_sub_list, red, sub_limit, df = True)
dfr = subreddit_title_scraper(blu_sub_list, blu, sub_limit, df = True)
print('Complete')
Add Comment
Please sign in to add a comment.