print('Loading Libraries')

# Standard Libraries
import pandas as pd
import numpy as np
from datetime import datetime

# URL Parser
from urllib.parse import urlparse

# Reddit API
import praw

# Sentiment and NLP TextBlob
from textblob import TextBlob

# Newspaper3k
from newspaper import Article

# Subreddit Scraper Function
# from Scraper_Library import subreddit_title_scraper

print('Completed')
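
# Note: TextBlob's noun_phrases (used below for keywords) relies on the NLTK
# corpora; if they are missing, run `python -m textblob.download_corpora` once
# before this script.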


print('Loading Reddit Params')

# Read key=value pairs from the params file into a dict; split on the
# first '=' only so that values may themselves contain '='
reddit_params = {}

with open('Scraper_Params.dat', mode='r') as fileObj:
    for line in fileObj:
        key_value = line.strip().split('=', 1)
        if len(key_value) == 2:
            reddit_params[key_value[0].strip()] = key_value[1].strip()

print('Complete')
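
# For reference, the parser above expects a hypothetical Scraper_Params.dat
# shaped like the sketch below (every value is a placeholder, not taken from
# the original paste):
#
#   red_client_id     = <reddit app id>
#   red_client_secret = <reddit app secret>
#   red_password      = <password>
#   red_user_agent    = <descriptive user agent>
#   red_username      = <username>
#   ... (the same five keys again with the blu_ prefix) ...
#   red_list          = subreddit_a, subreddit_b
#   blu_list          = subreddit_c, subreddit_d
#   limit_per_sub     = 100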
print('Assigning Variables')

# Two script-type Reddit API clients, one per account
red = praw.Reddit(client_id=reddit_params['red_client_id'],
                  client_secret=reddit_params['red_client_secret'],
                  password=reddit_params['red_password'],
                  user_agent=reddit_params['red_user_agent'],
                  username=reddit_params['red_username'])

blu = praw.Reddit(client_id=reddit_params['blu_client_id'],
                  client_secret=reddit_params['blu_client_secret'],
                  password=reddit_params['blu_password'],
                  user_agent=reddit_params['blu_user_agent'],
                  username=reddit_params['blu_username'])

# Comma-separated subreddit lists from the params file
red_sub_list = reddit_params['red_list'].strip().split(', ')
print(red_sub_list)
blu_sub_list = reddit_params['blu_list'].strip().split(', ')

sub_limit = int(reddit_params['limit_per_sub'].strip().split(', ')[0])

print('Complete')

def subreddit_title_scraper(sub_list, api, limit, target, df=True):
    """Scrape non-stickied hot posts from each subreddit in sub_list,
    label every row with `target`, and return a DataFrame (or a dict)."""

    posts_dict = {"post title"         : [],
                  "subreddit"          : [],
                  "is article"         : [],
                  "article title"      : [],
                  "title polarity"     : [],
                  "title subjectivity" : [],
                  "keywords"           : [],
                  "domain"             : [],
                  "link"               : [],
                  "author"             : [],
                  "date"               : [],
                  "target"             : [],
                  }

    article_count = 0
    invalid_links = 0

    for sub in sub_list:
        # Skip stickied posts (announcements, megathreads)
        submissions = (x for x in api.subreddit(sub).hot(limit=limit) if not x.stickied)

        for post in submissions:

            # Class label for this batch (True = red list, False = blu list)
            posts_dict["target"].append(target)

            # PRAW post metadata
            posts_dict["post title"].append(post.title)
            posts_dict["link"].append(post.url)
            posts_dict["subreddit"].append(sub)
            posts_dict["date"].append(datetime.fromtimestamp(post.created_utc))

            # Parse the URL for its domain
            parsed_url = urlparse(post.url)
            posts_dict["domain"].append(parsed_url.netloc)

            # TextBlob sentiment: .sentiment is (polarity, subjectivity)
            post_blob = TextBlob(post.title)
            posts_dict["title polarity"].append(post_blob.sentiment.polarity)
            posts_dict["title subjectivity"].append(post_blob.sentiment.subjectivity)
            posts_dict["keywords"].append(post_blob.noun_phrases)

            # Instantiate newspaper3k on the linked URL
            article = Article(post.url)
            if article.is_valid_url():                # Is the link a scrapable URL?

                try:
                    article.download()
                    article.parse()
                except Exception:
                    invalid_links += 1
                    posts_dict["is article"].append(False)
                    posts_dict["article title"].append(np.nan)
                    posts_dict["author"].append(np.nan)
                    continue

                if article.is_valid_body():           # Is the page an actual article?

                    article_count += 1
                    posts_dict["is article"].append(True)
                    posts_dict["article title"].append(article.title)
                    if article.authors:
                        posts_dict["author"].append(article.authors)
                    else:
                        posts_dict["author"].append(np.nan)

                    if article_count % 5 == 0:
                        print(f"Added {article_count} articles")

                else:
                    invalid_links += 1
                    posts_dict["is article"].append(False)
                    posts_dict["article title"].append(np.nan)
                    posts_dict["author"].append(np.nan)

                    if invalid_links % 5 == 0:
                        print(f"{invalid_links} invalid links skipped")

            else:
                # Keep the per-post lists the same length even when the URL
                # cannot be handled at all
                invalid_links += 1
                posts_dict["is article"].append(False)
                posts_dict["article title"].append(np.nan)
                posts_dict["author"].append(np.nan)

    if df:
        print(f"Creating data frame from {article_count + invalid_links} links")

        # Build the DataFrame with a fixed column order
        posts_df = pd.DataFrame(posts_dict)
        posts_df = posts_df[["subreddit", "post title", "keywords",
                             "title polarity", "title subjectivity",
                             "domain", "is article", "article title",
                             "link", "author", "date", "target"]]

        print(f"Done processing {article_count} articles and {invalid_links} non-articles as dataframe")

        return posts_df

    else:
        print(f"Done processing {article_count} articles and {invalid_links} non-articles as dictionary")

        return posts_dict

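# Usage note: with df=False the function returns the raw dict instead of a
# DataFrame, e.g. (subreddit name here is a placeholder):
#   posts = subreddit_title_scraper(['subreddit_a'], red, 10, target=True, df=False)
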
print(f"Pulling {sub_limit} posts from {blu_sub_list} and {red_sub_list}")

dfr = subreddit_title_scraper(red_sub_list, red, sub_limit, target=True, df=True)
dfb = subreddit_title_scraper(blu_sub_list, blu, sub_limit, target=False, df=True)

print('Complete')
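
# A possible follow-up (not in the original paste): both frames share the same
# columns, so they can be stacked into one labelled dataset and saved for a
# downstream classifier. The output file name is a placeholder.
df_all = pd.concat([dfr, dfb], ignore_index=True)
df_all.to_csv('scraped_posts.csv', index=False)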