Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- from datetime import datetime, timedelta
- import time
- import praw
- import psycopg2
- import sqlalchemy
- from sqlalchemy import Table, Column, Integer, String, Float, DateTime
- import sqlalchemy.orm
# --- Reddit OAuth configuration --------------------------------------------
# BUG FIX: the original paste had unterminated string literals for app_id /
# app_secret and no right-hand side at all for app_account_code / app_refresh
# (credentials redacted by the paste), making the file unparseable. Empty
# placeholders restore valid syntax; fill in real credentials before running.
user_agent = "cc test"
app_id = ""            # OAuth client id (redacted in original source)
app_secret = ""        # OAuth client secret (redacted in original source)
app_uri = 'https://127.0.0.1:65010/authorize_callback'
app_account_code = ""  # one-time authorization code (redacted)
app_refresh = ""       # long-lived refresh token (redacted)

# Full OAuth scope list requested for the session.
app_scopes = 'account creddits edit flair history identity livemanage modconfig ' \
             'modcontributors modflair modlog modothers modposts modself modwiki ' \
             'mysubreddits privatemessages read report save submit subscribe vote wikiedit wikiread'

# Global PRAW session shared by SearchAll and SubredditLatest below.
r = praw.Reddit(user_agent=user_agent)
r.set_oauth_app_info(app_id, app_secret, app_uri)
r.refresh_access_information(app_refresh)

# Result paging below is implemented recursively; cap the depth explicitly.
sys.setrecursionlimit(1000)
class SearchAll(object):
    """Search all of reddit for ``query`` back to datetime ``dt`` and store
    the results in a per-query PostgreSQL table.

    Uses reddit's cloudsearch syntax to page backwards through time: each
    pass narrows ``queryEndTime`` to the creation time of the oldest
    submission seen so far, then searches again until a pass yields no new
    results, at which point everything collected is written to Postgres.
    """

    def __init__(self, query, url, sort, dt):
        # master list of all available submissions (dicts in cloudsearch mode)
        self.results_list = []
        self.url = url
        self.query = query
        self.sort = sort
        self.count = 0
        # dt must be a naive UTC datetime; it bounds how far back to search
        self.dt = dt
        # cloudsearch timestamps are unix epoch seconds
        self.queryStartTime = int(time.mktime(self.dt.timetuple()))
        self.queryEndTime = int(time.mktime(datetime.utcnow().timetuple()))
        self.current = 0
        # count observed on the previous pass; an unchanged count means done
        self.lastCount = 0
        self.resultDict = {}

    def __call__(self):
        # BUG FIX: the original called self.search(self), which bound the
        # instance itself to the ``after`` parameter.
        self.search()
        return self.results_list

    def search(self, after=None, paginate=False):
        """Run one search pass; recurses with paginate=True until exhausted.

        after    -- last submission object of the previous pass (or None)
        paginate -- True on recursive continuation passes
        """
        limit = None
        params = {"q": self.query, "sort": self.sort}
        # 1 = normal search, 2 = cloudsearch (timestamp-bounded); the script
        # is hard-wired to cloudsearch mode.
        searchMode = 2
        cloudSearchQuery = """(and text:'{q}' timestamp:{csq1}..{csq2})""".format(
            q=self.query, csq1=self.queryStartTime, csq2=self.queryEndTime)

        if paginate is True:
            try:
                if searchMode == 1:
                    # resume listing after the last submission we saw
                    params["after"] = "t3_" + str(after.id)
                    results = r.get_content("https://www.reddit.com/r/all/search",
                                            params=params, limit=100)
                if searchMode == 2:
                    timeInvoked = datetime.now()
                    results = r.search(cloudSearchQuery, subreddit="all",
                                       sort="new", limit=limit,
                                       syntax="cloudsearch")
                    timeReturned = datetime.now()
            except IndexError:
                print(IndexError)
                return
        else:
            if searchMode == 1:
                results = r.get_content("https://www.reddit.com/r/all/search",
                                        params=params, limit=100)
            if searchMode == 2:
                timeInvoked = datetime.now()
                results = r.search(cloudSearchQuery, subreddit="all",
                                   sort="new", limit=limit,
                                   syntax="cloudsearch")
                timeReturned = datetime.now()

        if searchMode == 1:
            for x in results:
                # keep only items newer than the cutoff datetime
                if datetime.utcfromtimestamp(x.created_utc) >= self.dt.replace(microsecond=0):
                    self.results_list.append(x)
                    self.count += 1
                    after = x
        if searchMode == 2:
            for x in results:
                result = {'created': x.created_utc, 'fullname': x.fullname,
                          'url': x.url, 'title': x.title, 'sub': x.domain,
                          'author': x.author.name, 'query': self.query,
                          'time_invoked': timeInvoked,
                          'time_returned': timeReturned}
                self.results_list.append(result)
                self.count += 1
                after = x

        print("current count = {c}".format(c=self.count))
        # ROBUSTNESS FIX: the original dereferenced after.created_utc even
        # when a pass returned zero results and ``after`` was still None.
        if after is not None:
            # narrow the search window to end at the oldest submission seen
            self.queryEndTime = int(after.created_utc)
        if self.lastCount == self.count:
            # no new results on this pass -- the window is exhausted
            print("total results: {c}".format(c=self.count))
            self.redditWorker(self.results_list)
            return
        else:
            self.lastCount = self.count
        try:
            # BUG FIX: the original tested the undefined bare name ``count``
            # (NameError); the running total lives on the instance.
            if self.count < 10000:
                self.search(after=after, paginate=True)
        except praw.errors.HTTPException:
            print(praw.errors.HTTPException)
            return

    def redditWorker(self, results_list):
        """Persist collected result dicts into a per-query Postgres table."""
        if not results_list:
            # nothing to store
            return
        # BUG FIX: the original left ``tableName`` undefined (NameError on
        # first use) when the first result had no 'query' key.
        if 'query' in results_list[0]:
            tableName = "reddit_" + results_list[0]['query']
        else:
            tableName = "reddit_" + self.query

        connection = self.connect(user="postgres", password="", db="cc")
        meta = sqlalchemy.MetaData(bind=connection, reflect=True)
        # create the per-query table on first use
        if connection.dialect.has_table(connection.connect(), tableName) == False:
            table = Table(tableName, meta,
                          Column('url', String),
                          Column('title', String),
                          Column('fullname', String, primary_key=True),
                          Column('sub', String),
                          Column('author', String),
                          Column('query', String),
                          Column('time_created', DateTime),
                          Column('time_invoked', DateTime),
                          Column('time_returned', DateTime),
                          # (sic) misspelled column name kept: existing
                          # tables in the database already use it
                          Column('time_comitted', DateTime),
                          Column('time_taken_total', String))
            meta.create_all(connection)

        for result in results_list:
            # build one row dict, copying only the keys the result carries
            row = {}
            if 'url' in result:
                row['url'] = result['url']
            if 'title' in result:
                row['title'] = result['title']
            if 'fullname' in result:
                row['fullname'] = result['fullname']
            if 'sub' in result:
                row['sub'] = result['sub']
            if 'author' in result:
                row['author'] = result['author']
            if 'created' in result:
                row['time_created'] = datetime.fromtimestamp(result['created'])
            if 'time_invoked' in result:
                row['time_invoked'] = result['time_invoked']
            if 'time_returned' in result:
                row['time_returned'] = result['time_returned']
            # BUG FIX: the original chained .replace(microsecond=0) onto
            # dict.update(), which returns None and raised AttributeError;
            # the truncation belongs on the datetime value itself.
            row['time_comitted'] = datetime.now().replace(microsecond=0)
            if 'query' in result:
                row['query'] = result['query']
            if 'time_invoked' in result and 'time_returned' in result:
                row['time_taken_total'] = str(result['time_returned'] - result['time_invoked'])
            # same None-chaining bug fixed for the two optional timings
            if 'time_waiting' in result:
                row['time_waiting'] = result['time_waiting']
            if 'time_processing' in result:
                row['time_processing'] = result['time_processing']

            try:
                Session = sqlalchemy.orm.sessionmaker(bind=connection)
                session = Session()
                meta = sqlalchemy.MetaData(bind=connection, reflect=True)
                target = Table(tableName, meta, autoload=True)
                session.execute(target.insert().values(row))
                session.commit()
            except psycopg2.IntegrityError:
                # duplicate primary key (fullname) -- skip and continue
                print("psycopg2.IntegrityError")
            except sqlalchemy.exc.IntegrityError:
                print("sqlalchemy.exc.IntegrityError")

    def connect(self, user, password, db, host='localhost', port=5432):
        '''Returns a SQLAlchemy engine for the given PostgreSQL database.'''
        # We connect with the help of the PostgreSQL URL, e.g.
        # postgresql://federer:grandestslam@localhost:5432/tennis
        url = 'postgresql://{}:{}@{}:{}/{}'.format(user, password, host, port, db)
        # The return value of create_engine() is our connection object
        return sqlalchemy.create_engine(url, client_encoding='utf8')
class SubredditLatest(object):
    """Get all available submissions within a subreddit newer than x."""

    def __init__(self, subreddit, dt):
        # master list of all available submissions
        self.total_list = []
        # subreddit must be a string of the subreddit name (e.g., "soccer")
        self.subreddit = subreddit
        # dt must be a utc datetime object
        self.dt = dt

    def __call__(self):
        # BUG FIX: the original called self.get_submissions(self), binding
        # the instance to the ``paginate`` parameter; it only worked because
        # ``self is True`` happens to be False.
        self.get_submissions()
        return self.total_list

    def get_submissions(self, paginate=False):
        """Fetch one page of subreddit submissions; recurse until exhausted.

        paginate -- True on recursive continuation passes
        """
        limit = 100  # Reddit maximum page size
        if paginate is True:
            try:
                # get limit of items past the last item in the total list
                submissions = r.get_subreddit(self.subreddit).get_new(
                    limit=limit,
                    params={"after": self.total_list[-1].fullname})
            except IndexError:
                print(IndexError)
                return
        else:
            submissions = r.get_subreddit(self.subreddit).get_new(limit=limit)
        # keep only items created after the cutoff datetime
        submissions_list = [x for x in submissions
                            if datetime.utcfromtimestamp(x.created_utc) >= self.dt]
        self.total_list += submissions_list
        if len(submissions_list) == limit:
            # a full page means more items may be available -- keep paging
            self.get_submissions(paginate=True)
        else:
            # PORTABILITY FIX: strftime("%s") is a non-standard glibc
            # extension (fails on Windows); compute the epoch once with
            # time.mktime, matching SearchAll.__init__.
            cutoff_epoch = int(time.mktime(self.dt.timetuple()))
            for item in self.total_list:
                print(cutoff_epoch)
                print(item.fullname + " " + item.title)
            return
if __name__ == '__main__':
    # Convenience cutoff datetimes (all naive UTC).
    one_hour_ago = datetime.utcnow() - timedelta(hours=1)
    one_day_ago = datetime.utcnow() - timedelta(hours=24)
    one_week_ago = datetime.utcnow() - timedelta(days=7)
    one_month_ago = datetime.utcnow() - timedelta(days=31)
    one_year_ago = datetime.utcnow() - timedelta(days=365)
    five_years_ago = datetime.utcnow() - timedelta(days=1825)
    # print(SubredditLatest("all", one_hour_ago)())
    # NOTE: the original had ``global startTime`` here, which is a no-op at
    # module scope (the assignment below already creates a module global).
    startTime = datetime.utcnow()
    SearchAll("test", "all", "new", five_years_ago)()
Add Comment
Please, Sign In to add comment