Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- from datetime import datetime, timedelta
- import time
- import praw
- import psycopg2
- import sqlalchemy
- from sqlalchemy import Table, Column, Integer, String, Float, DateTime
- import sqlalchemy.orm
# --- Reddit OAuth configuration --------------------------------------------
# BUG FIX: the original paste had unterminated string literals for app_id /
# app_secret and no right-hand side at all for app_account_code / app_refresh
# (credentials redacted by the paste), making the file unparseable. Empty
# placeholders restore valid syntax; fill in real credentials before running.
user_agent = "cc test"
app_id = ""            # OAuth client id (redacted in original source)
app_secret = ""        # OAuth client secret (redacted in original source)
app_uri = 'https://127.0.0.1:65010/authorize_callback'
app_account_code = ""  # one-time authorization code (redacted)
app_refresh = ""       # long-lived refresh token (redacted)

# Full OAuth scope list requested for the session.
app_scopes = 'account creddits edit flair history identity livemanage modconfig ' \
             'modcontributors modflair modlog modothers modposts modself modwiki ' \
             'mysubreddits privatemessages read report save submit subscribe vote wikiedit wikiread'

# Global PRAW session shared by SearchAll and SubredditLatest below.
r = praw.Reddit(user_agent=user_agent)
r.set_oauth_app_info(app_id, app_secret, app_uri)
r.refresh_access_information(app_refresh)

# Result paging below is implemented recursively; cap the depth explicitly.
sys.setrecursionlimit(1000)
class SearchAll(object):
    """Search all of reddit for ``query`` back to datetime ``dt`` and store
    the results in a per-query PostgreSQL table.

    Uses reddit's cloudsearch syntax to page backwards through time: each
    pass narrows ``queryEndTime`` to the creation time of the oldest
    submission seen so far, then searches again until a pass yields no new
    results, at which point everything collected is written to Postgres.
    """

    def __init__(self, query, url, sort, dt):
        # master list of all available submissions (dicts in cloudsearch mode)
        self.results_list = []
        self.url = url
        self.query = query
        self.sort = sort
        self.count = 0
        # dt must be a naive UTC datetime; it bounds how far back to search
        self.dt = dt
        # cloudsearch timestamps are unix epoch seconds
        self.queryStartTime = int(time.mktime(self.dt.timetuple()))
        self.queryEndTime = int(time.mktime(datetime.utcnow().timetuple()))
        self.current = 0
        # count observed on the previous pass; an unchanged count means done
        self.lastCount = 0
        self.resultDict = {}

    def __call__(self):
        # BUG FIX: the original called self.search(self), which bound the
        # instance itself to the ``after`` parameter.
        self.search()
        return self.results_list

    def search(self, after=None, paginate=False):
        """Run one search pass; recurses with paginate=True until exhausted.

        after    -- last submission object of the previous pass (or None)
        paginate -- True on recursive continuation passes
        """
        limit = None
        params = {"q": self.query, "sort": self.sort}
        # 1 = normal search, 2 = cloudsearch (timestamp-bounded); the script
        # is hard-wired to cloudsearch mode.
        searchMode = 2
        cloudSearchQuery = """(and text:'{q}' timestamp:{csq1}..{csq2})""".format(
            q=self.query, csq1=self.queryStartTime, csq2=self.queryEndTime)

        if paginate is True:
            try:
                if searchMode == 1:
                    # resume listing after the last submission we saw
                    params["after"] = "t3_" + str(after.id)
                    results = r.get_content("https://www.reddit.com/r/all/search",
                                            params=params, limit=100)
                if searchMode == 2:
                    timeInvoked = datetime.now()
                    results = r.search(cloudSearchQuery, subreddit="all",
                                       sort="new", limit=limit,
                                       syntax="cloudsearch")
                    timeReturned = datetime.now()
            except IndexError:
                print(IndexError)
                return
        else:
            if searchMode == 1:
                results = r.get_content("https://www.reddit.com/r/all/search",
                                        params=params, limit=100)
            if searchMode == 2:
                timeInvoked = datetime.now()
                results = r.search(cloudSearchQuery, subreddit="all",
                                   sort="new", limit=limit,
                                   syntax="cloudsearch")
                timeReturned = datetime.now()

        if searchMode == 1:
            for x in results:
                # keep only items newer than the cutoff datetime
                if datetime.utcfromtimestamp(x.created_utc) >= self.dt.replace(microsecond=0):
                    self.results_list.append(x)
                    self.count += 1
                    after = x
        if searchMode == 2:
            for x in results:
                result = {'created': x.created_utc, 'fullname': x.fullname,
                          'url': x.url, 'title': x.title, 'sub': x.domain,
                          'author': x.author.name, 'query': self.query,
                          'time_invoked': timeInvoked,
                          'time_returned': timeReturned}
                self.results_list.append(result)
                self.count += 1
                after = x

        print("current count = {c}".format(c=self.count))
        # ROBUSTNESS FIX: the original dereferenced after.created_utc even
        # when a pass returned zero results and ``after`` was still None.
        if after is not None:
            # narrow the search window to end at the oldest submission seen
            self.queryEndTime = int(after.created_utc)
        if self.lastCount == self.count:
            # no new results on this pass -- the window is exhausted
            print("total results: {c}".format(c=self.count))
            self.redditWorker(self.results_list)
            return
        else:
            self.lastCount = self.count
        try:
            # BUG FIX: the original tested the undefined bare name ``count``
            # (NameError); the running total lives on the instance.
            if self.count < 10000:
                self.search(after=after, paginate=True)
        except praw.errors.HTTPException:
            print(praw.errors.HTTPException)
            return

    def redditWorker(self, results_list):
        """Persist collected result dicts into a per-query Postgres table."""
        if not results_list:
            # nothing to store
            return
        # BUG FIX: the original left ``tableName`` undefined (NameError on
        # first use) when the first result had no 'query' key.
        if 'query' in results_list[0]:
            tableName = "reddit_" + results_list[0]['query']
        else:
            tableName = "reddit_" + self.query

        connection = self.connect(user="postgres", password="", db="cc")
        meta = sqlalchemy.MetaData(bind=connection, reflect=True)
        # create the per-query table on first use
        if connection.dialect.has_table(connection.connect(), tableName) == False:
            table = Table(tableName, meta,
                          Column('url', String),
                          Column('title', String),
                          Column('fullname', String, primary_key=True),
                          Column('sub', String),
                          Column('author', String),
                          Column('query', String),
                          Column('time_created', DateTime),
                          Column('time_invoked', DateTime),
                          Column('time_returned', DateTime),
                          # (sic) misspelled column name kept: existing
                          # tables in the database already use it
                          Column('time_comitted', DateTime),
                          Column('time_taken_total', String))
            meta.create_all(connection)

        for result in results_list:
            # build one row dict, copying only the keys the result carries
            row = {}
            if 'url' in result:
                row['url'] = result['url']
            if 'title' in result:
                row['title'] = result['title']
            if 'fullname' in result:
                row['fullname'] = result['fullname']
            if 'sub' in result:
                row['sub'] = result['sub']
            if 'author' in result:
                row['author'] = result['author']
            if 'created' in result:
                row['time_created'] = datetime.fromtimestamp(result['created'])
            if 'time_invoked' in result:
                row['time_invoked'] = result['time_invoked']
            if 'time_returned' in result:
                row['time_returned'] = result['time_returned']
            # BUG FIX: the original chained .replace(microsecond=0) onto
            # dict.update(), which returns None and raised AttributeError;
            # the truncation belongs on the datetime value itself.
            row['time_comitted'] = datetime.now().replace(microsecond=0)
            if 'query' in result:
                row['query'] = result['query']
            if 'time_invoked' in result and 'time_returned' in result:
                row['time_taken_total'] = str(result['time_returned'] - result['time_invoked'])
            # same None-chaining bug fixed for the two optional timings
            if 'time_waiting' in result:
                row['time_waiting'] = result['time_waiting']
            if 'time_processing' in result:
                row['time_processing'] = result['time_processing']

            try:
                Session = sqlalchemy.orm.sessionmaker(bind=connection)
                session = Session()
                meta = sqlalchemy.MetaData(bind=connection, reflect=True)
                target = Table(tableName, meta, autoload=True)
                session.execute(target.insert().values(row))
                session.commit()
            except psycopg2.IntegrityError:
                # duplicate primary key (fullname) -- skip and continue
                print("psycopg2.IntegrityError")
            except sqlalchemy.exc.IntegrityError:
                print("sqlalchemy.exc.IntegrityError")

    def connect(self, user, password, db, host='localhost', port=5432):
        '''Returns a SQLAlchemy engine for the given PostgreSQL database.'''
        # We connect with the help of the PostgreSQL URL, e.g.
        # postgresql://federer:grandestslam@localhost:5432/tennis
        url = 'postgresql://{}:{}@{}:{}/{}'.format(user, password, host, port, db)
        # The return value of create_engine() is our connection object
        return sqlalchemy.create_engine(url, client_encoding='utf8')
class SubredditLatest(object):
    """Get all available submissions within a subreddit newer than x."""

    def __init__(self, subreddit, dt):
        # master list of all available submissions
        self.total_list = []
        # subreddit must be a string of the subreddit name (e.g., "soccer")
        self.subreddit = subreddit
        # dt must be a utc datetime object
        self.dt = dt

    def __call__(self):
        # BUG FIX: the original called self.get_submissions(self), binding
        # the instance to the ``paginate`` parameter; it only worked because
        # ``self is True`` happens to be False.
        self.get_submissions()
        return self.total_list

    def get_submissions(self, paginate=False):
        """Fetch one page of subreddit submissions; recurse until exhausted.

        paginate -- True on recursive continuation passes
        """
        limit = 100  # Reddit maximum page size
        if paginate is True:
            try:
                # get limit of items past the last item in the total list
                submissions = r.get_subreddit(self.subreddit).get_new(
                    limit=limit,
                    params={"after": self.total_list[-1].fullname})
            except IndexError:
                print(IndexError)
                return
        else:
            submissions = r.get_subreddit(self.subreddit).get_new(limit=limit)
        # keep only items created after the cutoff datetime
        submissions_list = [x for x in submissions
                            if datetime.utcfromtimestamp(x.created_utc) >= self.dt]
        self.total_list += submissions_list
        if len(submissions_list) == limit:
            # a full page means more items may be available -- keep paging
            self.get_submissions(paginate=True)
        else:
            # PORTABILITY FIX: strftime("%s") is a non-standard glibc
            # extension (fails on Windows); compute the epoch once with
            # time.mktime, matching SearchAll.__init__.
            cutoff_epoch = int(time.mktime(self.dt.timetuple()))
            for item in self.total_list:
                print(cutoff_epoch)
                print(item.fullname + " " + item.title)
            return
if __name__ == '__main__':
    # Convenience cutoff datetimes (all naive UTC).
    one_hour_ago = datetime.utcnow() - timedelta(hours=1)
    one_day_ago = datetime.utcnow() - timedelta(hours=24)
    one_week_ago = datetime.utcnow() - timedelta(days=7)
    one_month_ago = datetime.utcnow() - timedelta(days=31)
    one_year_ago = datetime.utcnow() - timedelta(days=365)
    five_years_ago = datetime.utcnow() - timedelta(days=1825)
    # print(SubredditLatest("all", one_hour_ago)())
    # NOTE: the original had ``global startTime`` here, which is a no-op at
    # module scope (the assignment below already creates a module global).
    startTime = datetime.utcnow()
    SearchAll("test", "all", "new", five_years_ago)()
Add Comment
Please, Sign In to add comment