Guest User

Untitled

a guest
Aug 30th, 2016
116
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 12.02 KB | None | 0 0
import os
import sys
import time
from datetime import datetime, timedelta

import praw
import psycopg2
import sqlalchemy
import sqlalchemy.orm
from sqlalchemy import Table, Column, Integer, String, Float, DateTime
  9.  
  10.  
  11. user_agent = "cc test"
  12. app_id = "
  13. app_secret = '
  14. app_uri = 'https://127.0.0.1:65010/authorize_callback'
  15. app_account_code =
  16. app_refresh =
  17. app_scopes = 'account creddits edit flair history identity livemanage modconfig ' \
  18.             'modcontributors modflair modlog modothers modposts modself modwiki ' \
  19.             'mysubreddits privatemessages read report save submit subscribe vote wikiedit wikiread'
  20. r = praw.Reddit(user_agent=user_agent)
  21. r.set_oauth_app_info(app_id, app_secret, app_uri)
  22. r.refresh_access_information(app_refresh)
  23. sys.setrecursionlimit(1000)
  24.  
  25.  
  26. class SearchAll(object):
  27.  
  28.    def __init__(self, query, url, sort, dt):
  29.        # master list of all available submissions
  30.        self.results_list = []
  31.  
  32.        self.url = url
  33.  
  34.        self.query = query
  35.  
  36.        self.sort = sort
  37.  
  38.        self.count = 0
  39.  
  40.        self.dt = dt
  41.  
  42.        self.queryStartTime = int(time.mktime(self.dt.timetuple()))
  43.  
  44.        self.queryEndTime = int(time.mktime(datetime.utcnow().timetuple()))
  45.  
  46.        self.current = 0
  47.  
  48.        self.lastCount = 0
  49.  
  50.        self.resultDict = {}
  51.  
  52.    def __call__(self):
  53.        self.search(self)
  54.        return self.results_list
  55.  
  56.    def search(self, after=None, paginate=False):
  57.        limit = None
  58.        params = {"q": self.query, "sort": self.sort}
  59.        searchMode = 2
  60.        #1 = normal search
  61.        #2 = cloudsearch
  62.  
  63.        cloudSearchQuery = """(and text:'{q}' timestamp:{csq1}..{csq2})""".format(q=self.query,
  64.                                                                                  csq1=self.queryStartTime,
  65.                                                                                  csq2=self.queryEndTime)
  66.        if paginate is True:
  67.            try:
  68.                if searchMode == 1:
  69.                    # results = r.get_subreddit(self.subreddit).get_new(limit=limit,
  70.                    #                       params={"after": self.total_list[-1].fullname})
  71.                    # results = r.search(self.query, limit=limit, sort='new', after=self.results_list[-1].fullname)
  72.                    params["after"] = "t3_" + str(after.id)
  73.                    results = r.get_content("https://www.reddit.com/r/all/search", params=params, limit=100)
  74.  
  75.                if searchMode == 2:
  76.                    #queryURL = "https://www.reddit.com/r/all/search?q=(and text:'{q}' time" \
  77.                    #           "stamp:{csq1}..{csq2})&sort=new&syntax=cloudsearch".format(q=self.query,
  78.                    #                                                                      csq1=self.queryStartTime,
  79.                    #                                                                      csq2=self.queryEndTime)
  80.                    timeInvoked = datetime.now()
  81.                    results = r.search(cloudSearchQuery, subreddit="all", sort="new", limit=limit, syntax="cloudsearch")
  82.                    timeReturned = datetime.now()
  83.            except IndexError:
  84.                print(IndexError)
  85.                return
  86.        else:
  87.            if searchMode == 1:
  88.                # results = r.search(self.query, limit=limit, sort='new')
  89.                results = r.get_content("https://www.reddit.com/r/all/search", params=params, limit=100)
  90.  
  91.            if searchMode == 2:
  92.                #queryURL = "https://www.reddit.com/r/all/search?q=(and text:'{q}' time" \
  93.                #           "stamp:{csq1}..{csq2})&sort=new&syntax=cloudsearch".format(q=self.query,
  94.                #                                                                    csq1=self.queryStartTime,
  95.                #                                                                    csq2=self.queryEndTime)
  96.                #print(cloudSearchQuery)
  97.                #print(queryURL)
  98.                #results = r.get_content(url=csQ, limit=limit)
  99.                timeInvoked = datetime.now()
  100.                results = r.search(cloudSearchQuery, subreddit="all", sort="new", limit=limit, syntax="cloudsearch")
  101.                timeReturned = datetime.now()
  102.  
  103.        if searchMode == 1:
  104.            for x in results:
  105.                if datetime.utcfromtimestamp(x.created_utc) >= self.dt.replace(microsecond=0):
  106.                    self.results_list.append(x)
  107.                    self.count += 1
  108.                    after = x
  109.  
  110.        if searchMode == 2:
  111.            for x in results:
  112.  
  113.                result = {'created': x.created_utc, 'fullname': x.fullname, 'url': x.url, 'title': x.title,
  114.                          'sub': x.domain, 'author': x.author.name, 'query': self.query, 'time_invoked': timeInvoked,
  115.                          'time_returned': timeReturned}
  116.                self.results_list.append(result)
  117.                self.count += 1
  118.                after = x
  119.            print("current count = {c}".format(c=self.count))
  120.        self.queryEndTime = int(after.created_utc)
  121.        if self.lastCount == self.count:
  122.            #for n in range(0, len(self.results_list)):
  123.            #    print(self.results_list[n].fullname + " " + self.results_list[n].title)
  124.            #    print(self.results_list[n])
  125.            print("total results: {c}".format(c=self.count))
  126.            self.redditWorker(self.results_list)
  127.            return
  128.        else:
  129.            self.lastCount = self.count
  130.           # print("Last result (number {n}): {t} -- Time created: {c} -- Time Elapsed: {e}".format(n=self.count, t=after.title,
  131.           #                                                     c=datetime.fromtimestamp(after.created_utc), e=(datetime.utcnow()-startTime)))
  132.            try:
  133.                if count < 10000:
  134.                    self.search(after=after, paginate=True)
  135.            except praw.errors.HTTPException:
  136.                print(praw.errors.HTTPException)
  137.                return
  138.  
  139.  
  140.    def redditWorker(self, results_list):
  141.        if 'query' in results_list[0]: tableName = "reddit_" + results_list[0]['query']
  142.        connection = self.connect(user="postgres", password="", db="cc")
  143.        meta = sqlalchemy.MetaData(bind=connection, reflect=True)
  144.        if connection.dialect.has_table(connection.connect(), tableName) == False:
  145.            table = Table(tableName, meta,
  146.                         Column('url', String),  #
  147.                         Column('title', String),
  148.                         Column('fullname', String, primary_key=True),  #
  149.                         Column('sub', String),  #
  150.                         Column('author', String),
  151.                         Column('query', String),
  152.                         Column('time_created', DateTime),
  153.                         Column('time_invoked', DateTime),
  154.                         Column('time_returned', DateTime),
  155.                         Column('time_comitted', DateTime),
  156.                         Column('time_taken_total', String))
  157.            meta.create_all(connection)
  158.        for result in results_list:
  159.  
  160.            # test = meta.tables[tableName]
  161.           # print(table)
  162.            row = {}
  163.  
  164.            if 'url' in result: row.update({'url': result['url']})
  165.            if 'title' in result: row.update({'title': result['title']})
  166.            if 'fullname' in result: row.update({'fullname': result['fullname']})
  167.            if 'sub' in result: row.update({'sub': result['sub']})
  168.            if 'author' in result: row.update({'author': result['author']})
  169.            if 'created' in result: row.update({'time_created':  datetime.fromtimestamp(result['created'])})
  170.  
  171.            if 'time_invoked' in result:
  172.                row.update({'time_invoked': result['time_invoked']})
  173.            if 'time_returned' in result:
  174.                row.update({'time_returned': result['time_returned']})
  175.  
  176.            if datetime: row.update({'time_comitted': datetime.now()}).replace(microsecond=0)
  177.  
  178.            if 'query' in result: row.update({'query': result['query']})
  179.  
  180.            if 'time_invoked' in result and 'time_returned' in result:
  181.                row.update({'time_taken_total': str(result['time_returned'] - result['time_invoked'])})
  182.            if 'time_waiting' in result: row.update({'time_waiting': result['time_waiting']}).replace(microsecond=0)
  183.            if 'time_processing' in result: row.update({'time_processing': result['time_processing']}).replace(microsecond=0)
  184.  
  185.            try:
  186.                Session = sqlalchemy.orm.sessionmaker(bind=connection)
  187.                session = Session()
  188.                meta = sqlalchemy.MetaData(bind=connection, reflect=True)
  189.                test = Table(tableName, meta, autoload=True)
  190.                insertion = test.insert().values(row)
  191.                session.execute(insertion)
  192.                session.commit()
  193.            except psycopg2.IntegrityError:
  194.                print("psycopg2.IntegrityError")
  195.            except sqlalchemy.exc.IntegrityError:
  196.                print("sqlalchemy.exc.IntegrityError")
  197.  
  198.    def connect(self, user, password, db, host='localhost', port=5432):
  199.        '''Returns a connection and a metadata object'''
  200.        # We connect with the help of the PostgreSQL URL
  201.        # postgresql://federer:grandestslam@localhost:5432/tennis
  202.        url = 'postgresql://{}:{}@{}:{}/{}'
  203.        url = url.format(user, password, host, port, db)
  204.  
  205.        # The return value of create_engine() is our connection object
  206.        conn = sqlalchemy.create_engine(url, client_encoding='utf8')
  207.  
  208.        # We then bind the connection to MetaData()
  209.        # meta = sqlalchemy.MetaData(bind=conn, reflect=True)
  210.  
  211.        return conn  # , meta
  212.  
  213.  
  214. class SubredditLatest(object):
  215.    """Get all available submissions within a subreddit newer than x."""
  216.  
  217.    def __init__(self, subreddit, dt):
  218.  
  219.        # master list of all available submissions
  220.        self.total_list = []
  221.  
  222.        # subreddit must be a string of the subreddit name (e.g., "soccer")
  223.        self.subreddit = subreddit
  224.  
  225.        # dt must be a utc datetime object
  226.        self.dt = dt
  227.  
  228.  
  229.    def __call__(self):
  230.        self.get_submissions(self)
  231.        return self.total_list
  232.  
  233.    def get_submissions(self, paginate=False):
  234.        """Get limit of subreddit submissions."""
  235.        limit = 100  # Reddit maximum limit
  236.  
  237.        if paginate is True:
  238.            try:
  239.                # get limit of items past the last item in the total list
  240.                submissions = r.get_subreddit(self.subreddit).get_new(limit=limit, params={"after": self.total_list[-1].fullname})
  241.            except IndexError:
  242.                print(IndexError)
  243.                return
  244.        else:
  245.            submissions = r.get_subreddit(self.subreddit).get_new(limit=limit)
  246.  
  247.        submissions_list = [
  248.            # iterate through the submissions generator object
  249.            x for x in submissions
  250.  
  251.            # add item if item.created_utc is newer than an hour ago
  252.            if datetime.utcfromtimestamp(x.created_utc) >= self.dt
  253.        ]
  254.        self.total_list += submissions_list
  255.  
  256.        # if you've hit the limit, recursively run this function again to get
  257.        # all of the available items
  258.        if len(submissions_list) == limit:
  259.            self.get_submissions(paginate=True)
  260.        else:
  261.            for x in range(0, len(self.total_list)):
  262.                print(int(self.dt.strftime("%s")))
  263.                print(self.total_list[x].fullname + " " + self.total_list[x].title)
  264.            return
  265.  
  266. if __name__ == '__main__':
  267.    one_hour_ago = datetime.utcnow() - timedelta(hours=1)
  268.    one_day_ago = datetime.utcnow() - timedelta(hours=24)
  269.    one_week_ago = datetime.utcnow() - timedelta(days=7)
  270.    one_month_ago = datetime.utcnow() - timedelta(days=31)
  271.    one_year_ago = datetime.utcnow() - timedelta(days=365)
  272.    five_years_ago = datetime.utcnow() - timedelta(days=1825)
  273.    #print(SubredditLatest("all", an_hour_ago)())
  274.    global startTime
  275.    startTime = datetime.utcnow()
  276.    SearchAll("test", "all", "new", five_years_ago)()
Add Comment
Please, Sign In to add comment