print('Loading Libraries')

# Standard Libraries
import pandas as pd
import numpy as np
from datetime import datetime

# URL Parser
from urllib.parse import urlparse

# Reddit API
import praw

# Sentiment and NLP TextBlob
from textblob import TextBlob

# Newspaper3k
from newspaper import Article

# Subreddit Scraper Function
# from Scraper_Library import subreddit_title_scraper

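# Note: the third-party imports above assume `pip install praw textblob
# newspaper3k`; TextBlob's noun_phrases extraction also needs its corpora
# (`python -m textblob.download_corpora`).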
print('Completed')


print('Loading Reddit Params')

# Read one 'key = value' pair per line into a dict; the 'with' block
# guarantees the file is closed afterwards.
reddit_params = {}
with open('Scraper_Params.dat', mode='r') as fileObj:
    for line in fileObj:
        line = line.strip()

        key_value = line.split('=')
        if len(key_value) == 2:
            reddit_params[key_value[0].strip()] = key_value[1].strip()
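# For reference, Scraper_Params.dat is assumed to look something like the
# sketch below: one 'key = value' pair per line, covering every key this
# script reads (values are placeholders, not real credentials):
#
#   red_client_id = <script-app id>
#   red_client_secret = <script-app secret>
#   red_password = <password>
#   red_user_agent = <descriptive user agent>
#   red_username = <username>
#   blu_client_id = <script-app id>
#   blu_client_secret = <script-app secret>
#   blu_password = <password>
#   blu_user_agent = <descriptive user agent>
#   blu_username = <username>
#   red_list = subredditA, subredditB
#   blu_list = subredditC, subredditD
#   limit_per_sub = 100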
print('Complete')
print('Assigning Variables')

red = praw.Reddit(client_id=reddit_params['red_client_id'],
                  client_secret=reddit_params['red_client_secret'],
                  password=reddit_params['red_password'],
                  user_agent=reddit_params['red_user_agent'],
                  username=reddit_params['red_username'])

blu = praw.Reddit(client_id=reddit_params['blu_client_id'],
                  client_secret=reddit_params['blu_client_secret'],
                  password=reddit_params['blu_password'],
                  user_agent=reddit_params['blu_user_agent'],
                  username=reddit_params['blu_username'])
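# Optional sanity check (an assumption, not in the original paste): PRAW's
# user.me() returns the authenticated account, so uncommenting this line
# confirms both credential sets work before any scraping starts.
# print(red.user.me(), blu.user.me())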

red_sub_list = reddit_params['red_list'].strip().split(', ')
print(red_sub_list)
blu_sub_list = reddit_params['blu_list'].strip().split(', ')


# Take the first comma-separated token in case the value holds extras.
sub_limit = int(reddit_params['limit_per_sub'].strip().split(', ')[0])

print('Complete')

def subreddit_title_scraper(sub_list, api, limit, df=True):
    """Scrape non-stickied hot posts from each subreddit in sub_list via the
    given PRAW client; return a DataFrame (df=True) or a dict of lists."""

    posts_dict = {"post title": [],
                  "subreddit": [],
                  "is article": [],
                  "article title": [],
                  "title polarity": [],
                  "title subjectivity": [],
                  "keywords": [],
                  "domain": [],
                  "link": [],
                  "author": [],
                  "date": [],
                  "target": [],
                  }

    article_count = 0
    invalid_links = 0
    for sub in sub_list:
        # Skip stickied posts so pinned announcements don't pollute the sample.
        submissions = (x for x in api.subreddit(sub).hot(limit=limit) if not x.stickied)

        for post in submissions:

            # Label which side the post came from: True for the red list,
            # False otherwise, so every list stays the same length.
            posts_dict['target'].append(sub_list == red_sub_list)

            posts_dict["post title"].append(post.title)  ## praw reddit scraping to dict ##
            posts_dict["link"].append(post.url)
            posts_dict["subreddit"].append(sub)
            # created_utc is epoch seconds in UTC, so convert it as UTC.
            posts_dict["date"].append(datetime.utcfromtimestamp(post.created_utc))

            parsed_url = urlparse(post.url)  ## Parse URL for domain ##
            posts_dict['domain'].append(parsed_url.netloc)

            # TextBlob sentiment is a (polarity, subjectivity) pair.
            post_blob = TextBlob(post.title)
            posts_dict["title polarity"].append(post_blob.sentiment[0])
            posts_dict["title subjectivity"].append(post_blob.sentiment[1])
            posts_dict["keywords"].append(post_blob.noun_phrases)
            article = Article(post.url)  ## Instantiate newspaper3k library ##
            if article.is_valid_url():  ## Does the URL look like an article link? ##

                try:
                    article.download()
                    article.parse()
                except Exception:
                    posts_dict["is article"].append(False)
                    posts_dict["article title"].append(np.nan)
                    posts_dict["author"].append(np.nan)
                    continue

                if article.is_valid_body():  ## Is post an article? ##

                    article_count += 1
                    posts_dict["is article"].append(True)
                    posts_dict["article title"].append(article.title)
                    if article.authors:
                        posts_dict["author"].append(article.authors)
                    else:
                        posts_dict["author"].append(np.nan)

                    if article_count % 5 == 0:
                        print(f"Added {article_count} articles")

                else:

                    invalid_links += 1
                    posts_dict["is article"].append(False)
                    posts_dict["article title"].append(np.nan)
                    posts_dict["author"].append(np.nan)

                    if invalid_links % 5 == 0:
                        print(f"{invalid_links} invalid links skipped")

            else:
                # Not an article-style URL: record it so the lists stay aligned.
                invalid_links += 1
                posts_dict["is article"].append(False)
                posts_dict["article title"].append(np.nan)
                posts_dict["author"].append(np.nan)
    if df:

        print(f"creating data frame from {article_count + invalid_links} links")

        posts_df = pd.DataFrame(posts_dict)  ## Make it a dataframe ##
        posts_df = posts_df[["subreddit", "post title", "keywords",
                             "title polarity", "title subjectivity",
                             "domain", "is article", "article title",
                             "link", "author", "date", "target"]]

        print(f"Done processing {article_count} articles and {invalid_links} non-articles as dataframe")

        return posts_df

    else:
        print(f"Done processing {article_count} articles and {invalid_links} non-articles as dictionary")

        return posts_dict

print(f"Pulling {sub_limit} posts from {blu_sub_list} and {red_sub_list}")

dfr = subreddit_title_scraper(red_sub_list, red, sub_limit, df=True)
dfb = subreddit_title_scraper(blu_sub_list, blu, sub_limit, df=True)
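# A possible next step (an assumption, not in the original paste): stack the
# two labelled frames into a single dataset keyed by the 'target' column.
# combined = pd.concat([dfr, dfb], ignore_index=True)
# combined.to_csv('reddit_posts.csv', index=False)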

print('Complete')