erktheerk

NSAScrapper

Feb 2nd, 2015
350
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.30 KB | None | 0 0
  1. #/u/GoldenSights
  2. import praw # simple interface to the reddit API, also handles rate limiting of requests
  3. import time
  4. import datetime
  5. import pickle
  6.  
'''USER CONFIGURATION'''

USERNAME  = "erktheerk"
#The bot's reddit username. It needs some karma before reddit allows it to send mail.
PASSWORD  = ""
#The bot's password. Left blank here; normally overridden by the private 'bot' module below.
USERAGENT = "NSALeaks Content scrapper. To be used to populate a list to be used for More Article section of /r/NSALeaks wiki"
#A short description of what the bot does, sent to reddit with every request.
SUBREDDIT = "NSALeaks"
#This is the sub or list of subs to scan for new posts. For a single sub, use "sub1". For multiple subs, use "sub1+sub2+sub3+...". For all use "all"
KEYWORDS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
#Words to look for in post titles (case-insensitive substring match).
#NOTE: listing every letter of the alphabet effectively matches ANY title.
KEYDOMAINS = []
#URL substrings to look for (case-insensitive). An empty list matches nothing.
KEYNAMES = [""]
#Author names to look for (case-insensitive exact match). "" never equals a real author name.

IGNORESELF = False
#Skip selfposts when True. Use True or False (Use capitals! No quotations!)
TIMESTAMP = '%A %d %B %Y'
#strftime format used for the date headings.
#  "%A %d %B %Y" = "Wednesday 04 June 2014"
#http://docs.python.org/2/library/time.html#time.strftime

HEADER = ""
#Written once at the top of each output file (skipped when empty).

#FORMAT = "_timestamp_: [_title_](_url_) - /u/_author_ - [**Discussion**](_nplink_)"
#FORMAT = "_flairtext_ _author_ - [_title_](_nplink_)\n"
FORMAT = ">>\n* _flairtext_ [_title_](_url_) - /u/_author_ - [**Discussion**](_nplink_)\n>>"
TSFORMAT = ">_timestamp_\n"
#Markdown templates: FORMAT for each post, TSFORMAT for date headings.
#USE THESE INJECTORS TO CREATE CUSTOM OUTPUT (each is replaced per post):
#_timestamp_ which follows the TIMESTAMP format
#_title_
#_url_
#_subreddit_
#_nplink_ (the post's short reddit permalink)
#_author_
#_flairtext_ / _flaircss_ (link flair; replaced with "" when the post has none)

PRINTFILE = "nsa"
#Base name of the files that will be produced. Do not type the file extension

MAXPOSTS = 800
#This is how many posts you want to retrieve all at once.

'''All done!'''
  54. for m in ["_date", "_author", "_subreddit", "_title"]:
  55.     clistfile = open(PRINTFILE + m + '.txt', "a+")
  56.     clistfile.close()
  57. #This is a hackjob way of creating the files if they do not exist.
  58.  
  59. MAXS = str(MAXPOSTS)
  60. try:
  61.     import bot #This is a file in my python library which contains my Bot's username and password. I can push code to Git without showing credentials
  62.     USERNAME = bot.getuG()
  63.     PASSWORD = bot.getpG()
  64.     USERAGENT = bot.getaG()
  65. except ImportError:
  66.     pass
  67.  
  68. print('Logging in ' + USERNAME)
  69. r = praw.Reddit(USERAGENT)
  70. r.login(USERNAME, PASSWORD)
  71.  
  72. def work(lista):
  73.     global listfile
  74.     if HEADER != "":
  75.         print(HEADER, file=listfile)
  76.     previous_timestamp = ""
  77.     for post in lista:
  78.         timestamp = post.created_utc
  79.         timestamp = datetime.datetime.fromtimestamp(int(timestamp)).strftime(TIMESTAMP)
  80.         final = FORMAT
  81.         if timestamp != previous_timestamp:
  82.             final = TSFORMAT + final
  83.         final = final.replace('_timestamp_', timestamp)
  84.         final = final.replace('_title_', post.title)
  85.         flair_text = post.link_flair_text if post.link_flair_text else ""
  86.         flair_css = post.link_flair_css_class if post.link_flair_css_class else ""
  87.         post.link_flair_text = flair_text
  88.         post.link_flair_css_class = flair_css
  89.         final = final.replace('_flairtext_', flair_text)
  90.         final = final.replace('_flaircss_', flair_css)
  91.         try:
  92.             final = final.replace('_author_', post.author.name)
  93.         except Exception:
  94.             final = final.replace('_author_', '[DELETED]')
  95.         final = final.replace('_subreddit_', post.subreddit.display_name)
  96.         url = post.url
  97.         url = url.replace('http://www.reddit.com', 'https://www.reddit.com')
  98.         final = final.replace('_url_', url)
  99.         slink = post.short_link
  100.         #slink = slink.replace('http://', 'https://www.')
  101.         final = final.replace('_nplink_', slink)
  102.         final = final.replace('_flairtext_', flair_text)
  103.         print(final, file=listfile)
  104.         previous_timestamp = timestamp
  105.  
  106.  
  107.  
  108. lista = []
  109. count =  0
  110. counta = 0
  111. try:
  112.     print('Scanning.')
  113.     subreddit = r.get_subreddit(SUBREDDIT)
  114.     posts = subreddit.get_new(limit=MAXPOSTS)
  115.     for post in posts:
  116.         if not post.is_self or IGNORESELF == False:
  117.             try:
  118.                 author = post.author.name
  119.             except Exception:
  120.                 author = '[DELETED]'
  121.             if any(m.lower() in post.title.lower() for m in KEYWORDS) \
  122.             or any(m.lower() in post.url.lower() for m in KEYDOMAINS) \
  123.             or any(m.lower() == author.lower() for m in KEYNAMES):
  124.                 lista.append(post)
  125.                 counta += 1
  126.         count += 1
  127.         print(str(count) + ' / ' + MAXS + ' | ' + str(counta))
  128.    
  129.     for item in lista:
  130.         if item.author == None:
  131.             item.author = '[DELETED]'
  132. except Exception:
  133.     print('EMERGENCY')
  134.  
  135. print('Collected ' + str(counta) + ' items.')
  136. print('Writing Time file')
  137. lista.sort(key=lambda x: x.created_utc, reverse=True)
  138. listfile = open(PRINTFILE + '_date.txt', 'w', encoding='utf-8')
  139. work(lista)
  140. listfile.close()
  141.  
  142. print('Writing Subreddit file')
  143. lista.sort(key=lambda x: x.subreddit.display_name.lower(), reverse=False)
  144. listfile = open(PRINTFILE + '_subreddit.txt', 'w', encoding='utf-8')
  145. work(lista)
  146. listfile.close()
  147.  
  148. print('Writing Title file')
  149. lista.sort(key=lambda x: x.title.lower(), reverse=False)
  150. listfile = open(PRINTFILE + '_title.txt', 'w', encoding='utf-8')
  151. work(lista)
  152. listfile.close()
  153.  
  154. print('Writing Author file')
  155. lista.sort(key=lambda x: x.author.name.lower(), reverse=False)
  156. listfile = open(PRINTFILE + '_author.txt', 'w', encoding='utf-8')
  157. work(lista)
  158. listfile.close()
  159.  
  160. print('Writing flair file')
  161. #lista.sort(key=lambda x: x.link_flair_text.lower(), reverse=False)
  162. now = datetime.datetime.now(datetime.timezone.utc).timestamp()
  163. lista.sort(key=lambda x: (x.link_flair_text, now-x.created_utc))
  164. for index in range(len(lista)):
  165.     if lista[index].link_flair_text != "":
  166.         lista = lista[index:] + lista[:index]
  167.         break
  168. listfile = open(PRINTFILE + '_flair.txt', 'w', encoding='utf-8')
  169. work(lista)
  170. listfile.close()
  171.  
  172. print('Saving to Pickle.')
  173. class Posted(object):
  174.     pass
  175. listc = []
  176. for item in lista:
  177.     obj = Posted()
  178.     obj.id = item.id
  179.     obj.fullname = item.fullname
  180.     obj.created_utc = item.created_utc
  181.     obj.title = item.title
  182.     obj.subreddit = item.subreddit.display_name
  183.     obj.url = item.url
  184.     obj.short_link = item.short_link
  185.     try:
  186.         obj.author = item.author.name
  187.     except:
  188.         obj.author = '[DELETED]'
  189.     if item.is_self == True:
  190.         obj.is_self = True
  191.         obj.selftext = item.selftext
  192.     else:
  193.         obj.is_self = False
  194.     listc.append(obj.__dict__)
  195. filec = open(PRINTFILE + '.p', 'wb')
  196. pickle.dump(listc, filec)
  197. print('Done.')
Advertisement
Add Comment
Please, Sign In to add comment