Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# /u/GoldenSights
import datetime
import pickle
import re
import time

import praw  # simple interface to the reddit API, also handles rate limiting of requests
# ---------------------------------------------------------------------------
# USER CONFIGURATION
# ---------------------------------------------------------------------------

# The bot's username. In order to send mail, the account must have some karma.
USERNAME = "erktheerk"

# The bot's password.
PASSWORD = ""

# A short description of what the bot does.
# For example "/u/GoldenSights' Newsletter bot".
USERAGENT = "NSALeaks Content scrapper. To be used to populate a list to be used for More Article section of /r/NSALeaks wiki"

# The sub or list of subs to scan for new posts. For a single sub use "sub1".
# For multiple subs use "sub1+sub2+sub3+...". For everything use "all".
SUBREDDIT = "NSALeaks"

# Words to look for. Each entry is matched as a case-insensitive substring of
# the post title; listing every letter therefore accepts any title that
# contains a letter.
KEYWORDS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]

# Domains to look for (case-insensitive substring of the post URL).
KEYDOMAINS = []

# Author names to look for (case-insensitive exact match). An empty string
# never equals a real author name.
KEYNAMES = [""]

# Set to True to skip self posts, False to keep them.
# Use True or False (capitalised, no quotation marks).
IGNORESELF = False

# The time format.
# "%A %d %B %Y" = "Wednesday 04 June 2014"
# https://docs.python.org/3/library/time.html#time.strftime
TIMESTAMP = '%A %d %B %Y'

# Put this at the top of each .txt file. Leave empty for no header.
HEADER = ""

# Template for one post. Alternative layouts:
# FORMAT = "_timestamp_: [_title_](_url_) - /u/_author_ - [**Discussion**](_nplink_)"
# FORMAT = "_flairtext_ _author_ - [_title_](_nplink_)\n"
FORMAT = ">>\n* _flairtext_ [_title_](_url_) - /u/_author_ - [**Discussion**](_nplink_)\n>>"

# Template placed in front of a post whenever its date differs from the
# previous post's date.
TSFORMAT = ">_timestamp_\n"

# USE THESE INJECTORS TO CREATE CUSTOM OUTPUT
# _timestamp_  which follows the TIMESTAMP format
# _title_
# _url_
# _subreddit_
# _nplink_
# _author_
# _flairtext_
# _flaircss_

# Base name of the files that will be produced. Do not type the file extension.
PRINTFILE = "nsa"

# How many posts to retrieve all at once.
MAXPOSTS = 800

# ---------------------------------------------------------------------------
# All done!
# ---------------------------------------------------------------------------
# Make sure every output file exists before the run starts. Opening in append
# mode creates a missing file and leaves an existing one untouched; the
# context manager closes the handle even if opening a later file fails.
# "_flair" is included because the script writes that listing as well.
for m in ["_date", "_author", "_subreddit", "_title", "_flair"]:
    with open(PRINTFILE + m + '.txt', "a+"):
        pass
# String form of the post limit, used when printing scan progress.
MAXS = str(MAXPOSTS)

# Optional credential override: if a local ``bot`` module can be imported, its
# values replace the ones configured above, so the script can be published
# without credentials in it. Without that module the configured values stay.
try:
    import bot
    USERNAME = bot.getuG()
    PASSWORD = bot.getpG()
    USERAGENT = bot.getaG()
except ImportError:
    pass

print('Logging in ' + USERNAME)
# ``r`` is the reddit session used by the scanning code further down.
r = praw.Reddit(USERAGENT)
r.login(USERNAME, PASSWORD)
def work(lista):
    """Write one formatted entry per post to the currently open ``listfile``.

    Each post is rendered with the FORMAT template. Whenever the formatted
    date of a post differs from that of the previous post, TSFORMAT is put in
    front of the entry. HEADER, when non-empty, is written once before the
    first entry.

    Placeholders are filled in a single pass over the template, so text that
    comes from a post (for example a title containing ``_author_``) is never
    re-interpreted as a placeholder.

    Side effect: ``link_flair_text`` and ``link_flair_css_class`` of every
    post are normalised from None to "" so they can be treated as plain
    strings afterwards.

    Args:
        lista: iterable of post objects exposing ``created_utc``, ``title``,
            ``url``, ``short_link``, ``author``, ``subreddit.display_name``
            and the two flair attributes.
    """
    global listfile
    if HEADER != "":
        print(HEADER, file=listfile)

    placeholder = re.compile(
        r'_(?:timestamp|title|flairtext|flaircss|author|subreddit|url|nplink)_')
    previous_timestamp = ""
    for post in lista:
        timestamp = datetime.datetime.fromtimestamp(
            int(post.created_utc)).strftime(TIMESTAMP)

        template = FORMAT
        if timestamp != previous_timestamp:
            # The date changed: prefix this entry with the date template.
            template = TSFORMAT + template

        flair_text = post.link_flair_text if post.link_flair_text else ""
        flair_css = post.link_flair_css_class if post.link_flair_css_class else ""
        post.link_flair_text = flair_text
        post.link_flair_css_class = flair_css

        try:
            author = post.author.name
        except AttributeError:
            # The author is None or a plain string when the account is gone.
            author = '[DELETED]'

        values = {
            '_timestamp_': timestamp,
            '_title_': post.title,
            '_flairtext_': flair_text,
            '_flaircss_': flair_css,
            '_author_': author,
            '_subreddit_': post.subreddit.display_name,
            '_url_': post.url.replace('http://www.reddit.com',
                                      'https://www.reddit.com'),
            '_nplink_': post.short_link,
        }
        final = placeholder.sub(lambda match: values[match.group(0)], template)
        print(final, file=listfile)
        previous_timestamp = timestamp
# Collect every post from the newest MAXPOSTS submissions that matches at
# least one keyword, domain or author-name filter.
lista = []    # matching posts
count = 0     # posts examined so far
counta = 0    # posts accepted so far
try:
    print('Scanning.')
    subreddit = r.get_subreddit(SUBREDDIT)
    posts = subreddit.get_new(limit=MAXPOSTS)
    for post in posts:
        # Self posts are only considered when IGNORESELF is off.
        if not post.is_self or not IGNORESELF:
            try:
                author = post.author.name
            except AttributeError:
                # post.author is None when the account was deleted.
                author = '[DELETED]'
            # Lower-case each field once instead of once per filter entry.
            title_lower = post.title.lower()
            url_lower = post.url.lower()
            author_lower = author.lower()
            if (any(m.lower() in title_lower for m in KEYWORDS)
                    or any(m.lower() in url_lower for m in KEYDOMAINS)
                    or any(m.lower() == author_lower for m in KEYNAMES)):
                lista.append(post)
                counta += 1
        count += 1
        print(str(count) + ' / ' + MAXS + ' | ' + str(counta))
    for item in lista:
        if item.author is None:
            item.author = '[DELETED]'
except Exception as error:
    # Best effort: keep whatever was collected before the failure, but show
    # what went wrong instead of hiding it.
    print('EMERGENCY')
    print(repr(error))
print('Collected ' + str(counta) + ' items.')
def _author_sort_key(post):
    """Return the post's lower-cased author name for sorting.

    A deleted author is either None or the plain string '[DELETED]'; neither
    has a ``name`` attribute, so both fall back to '[DELETED]'.
    """
    return getattr(post.author, 'name', '[DELETED]').lower()


def _write_listing(suffix):
    """Render the current ``lista`` into PRINTFILE + suffix + '.txt'.

    ``work`` writes to the global ``listfile``, so the file is bound to that
    name; the context manager closes it even if rendering fails.
    """
    global listfile
    with open(PRINTFILE + suffix + '.txt', 'w', encoding='utf-8') as listfile:
        work(lista)


print('Writing Time file')
lista.sort(key=lambda x: x.created_utc, reverse=True)
_write_listing('_date')

print('Writing Subreddit file')
lista.sort(key=lambda x: x.subreddit.display_name.lower(), reverse=False)
_write_listing('_subreddit')

print('Writing Title file')
lista.sort(key=lambda x: x.title.lower(), reverse=False)
_write_listing('_title')

print('Writing Author file')
lista.sort(key=_author_sort_key, reverse=False)
_write_listing('_author')

print('Writing flair file')
# Group by flair text, newest post first within each flair. A missing flair
# is treated as "" so None and str values are never compared with each other.
lista.sort(key=lambda x: (x.link_flair_text or "", -x.created_utc))
# Posts without flair sort to the front; rotate the list so that the first
# flaired post leads and the unflaired ones end up at the back.
for index, entry in enumerate(lista):
    if entry.link_flair_text:
        lista = lista[index:] + lista[:index]
        break
_write_listing('_flair')
print('Saving to Pickle.')


def _post_record(item):
    """Return a plain dict holding the fields of ``item`` that get pickled.

    ``selftext`` is only present for self posts. A deleted author is stored
    as '[DELETED]'.
    """
    try:
        author = item.author.name
    except AttributeError:
        # The author is None or a plain string when the account is gone.
        author = '[DELETED]'
    record = {
        'id': item.id,
        'fullname': item.fullname,
        'created_utc': item.created_utc,
        'title': item.title,
        'subreddit': item.subreddit.display_name,
        'url': item.url,
        'short_link': item.short_link,
        'author': author,
    }
    if item.is_self:
        record['is_self'] = True
        record['selftext'] = item.selftext
    else:
        record['is_self'] = False
    return record


# Only plain dicts are pickled, so the file can be loaded without this script.
listc = [_post_record(item) for item in lista]
with open(PRINTFILE + '.p', 'wb') as filec:
    pickle.dump(listc, filec)
print('Done.')
Advertisement
Add Comment
Please, Sign In to add comment