Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- The goal of this program is to:
- 1) Find top submissions from multiple subreddits
- 2) Look through the comments that contain links
- 3) Sort by score
- 4) Document Username, Link, Link Context, Parent Comment
- 5) Store information to file
- """
- # Importing necessary modules (For the most part)
- import praw
- import logging
- import re
- from pprint import pprint
- import requests
- import json
# Adds logging to the program, tells when information is being requested.
# A dedicated DEBUG handler is attached to praw's transport logger ("prawcore")
# so every HTTP request praw makes is echoed to stderr.
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
logger = logging.getLogger("prawcore")
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
# Configures how we will view logs.
# NOTE(review): basicConfig attaches a second handler to the root logger, and
# "prawcore" records propagate to the root — so each prawcore message is likely
# emitted twice (once per handler). Confirm and drop one of the two setups.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s -%(levelname)s - %(message)s')
# NOTE(review): this immediately overrides the DEBUG level that basicConfig
# just set on the root logger; root-level records below INFO are discarded.
logging.getLogger().setLevel(logging.INFO)
# Initialize Praw, the Reddit API Wrapper.
# SECURITY(review): real client secret and account password are hard-coded here
# (and have been published with this paste). Rotate these credentials and load
# them from environment variables or a praw.ini file instead of source code.
reddit = praw.Reddit(client_id='pQr2bTUsJzvAYA',
                     client_secret='hNsIBr42jlcVWy8_-TFHZAsOb08',
                     password='ThereIsNoHopeRight',
                     username='LeftcomGANG',
                     user_agent="Linux : 7NgvlaWfS9R0nA : 1.0 (by /u/Nighthawk153)")
# It's supposed to be read only but I can't get it to be dat way.
# NOTE(review): supplying username/password yields an authorized (read-write)
# instance — presumably omitting them would make read_only True; confirm
# against the praw authentication docs.
print(reddit.read_only)
def getSubComments(comment, allComments, verbose=True):
    """
    Recursively walk a comment and every reply beneath it, accumulating each
    node into ``allComments``.

    :param comment: the comment object (or MoreComments-style stub) to expand
    :param allComments: list being filled in place with every comment visited
    :param verbose: when True, log a progress line each time a stub is fetched
    :return: None — results are accumulated in ``allComments``
    """
    allComments.append(comment)
    if hasattr(comment, "replies"):
        # Ordinary comment: its children are already attached.
        children = comment.replies
    else:
        # Stub object (e.g. MoreComments): must be expanded with a fetch.
        children = comment.comments()
        if verbose:
            print("fetching (" + str(len(allComments)) + " comments fetched total)")
    for child in children:
        getSubComments(child, allComments, verbose=verbose)
def getAll(r, submissionId, verbose=True):
    """
    Gather every comment (top-level and nested) from one thread.

    :param r: the praw Reddit instance used to resolve the submission
    :param submissionId: identifier of the thread being scraped
    :param verbose: forwarded to getSubComments; controls progress output
    :return: list of every comment object found in the thread
    """
    thread = r.submission(submissionId)
    collected = []
    # Expand each top-level comment into the accumulator, including all
    # of its nested replies.
    for topLevel in thread.comments:
        getSubComments(topLevel, collected, verbose=verbose)
    print(len(collected))
    return collected
# Creates a non-filtered and non-purified list of links, needs to be scrubbed
def CreateLinkList(CommentList):
    """
    Collect raw link-looking tokens from a list of comments.

    Splits each comment body on whitespace and keeps every token containing
    ".com", ".org" or ".net". The result is unscrubbed — pass it through
    PurifyLinkList to extract clean URLs.

    :param CommentList: iterable of comment objects (anything with a
        string ``.body`` attribute; objects without one are counted as failed)
    :return: list of raw tokens that look like links
    """
    LinkList = []
    counter = 0
    failedcounter = 0
    # Renamed from `list` to avoid shadowing the builtin.
    domains = (".com", ".org", ".net")
    for comment in CommentList:
        # Narrow try: only the attribute access can legitimately fail here
        # (e.g. MoreComments stubs have no .body). The original bare except
        # silently swallowed every error type.
        try:
            body = comment.body
        except AttributeError:
            failedcounter += 1
            continue
        for word in body.split():
            for target in domains:
                # NOTE: a token matching two domains is appended twice,
                # preserving the original behavior.
                if target in word:
                    LinkList.append(word)
                    counter += 1
    logging.info("Failed: " + str(failedcounter))
    logging.info("Worked: " + str(counter))
    return LinkList
def CreateLog(list, logfile):
    """
    Write each link to the given log file, one per entry separated by a
    blank line, then close the file.

    :param list: the links to write (name kept for interface compatibility,
        although it shadows the builtin ``list``)
    :param logfile: an open, writable file object; it is closed on return
    :return: None
    """
    logging.info("Writing links to file.... Link.txt")
    for item in list:
        print(item)
        # BUG FIX: write to the `logfile` parameter. The original ignored it
        # and wrote to the module-level global LinkLog instead, so passing a
        # different file object had no effect.
        logfile.write(item)
        logfile.write("\n\n")
    logfile.close()
    logging.info("Successful in writing to document!")
def PurifyLinkList(LinkList):
    """
    Use a regex to extract a clean, clickable http(s) URL from each raw
    link-bearing token.

    :param LinkList: list of strings that may contain a URL somewhere inside
    :return: list of the URL portion of every entry that matched; entries
        with no http(s) URL are skipped
    """
    # Raw string so the \s escape is passed to the regex engine verbatim.
    URLRegex = re.compile(r"(?P<url>https?://[^\s]+)")
    LinkResults = []
    for link in LinkList:
        match = URLRegex.search(link)
        # Explicit None check replaces the original bare except, which used
        # the AttributeError from a failed match for control flow (and would
        # also have hidden any unrelated error).
        if match is not None:
            LinkResults.append(match.group())
    return LinkResults
# Subreddits to scrape (one for now).
LeftistList = ['esist']
# Feature flags: scrape comment links and/or thread links.
commentchoice = False
threadchoice = True
threadamount = 10  # NOTE(review): unused — subreddit.hot() below hard-codes limit=10
totalcomments = []  # NOTE(review): unused accumulator
threadlinks = []
threadlist = []
threadcomments = []
for sub in LeftistList:
    subreddit = reddit.subreddit(str(sub))
    # The program will loop through each subreddit title, and look through it's comments
    try:
        for thread in subreddit.hot(limit=10):
            threadlist.append(thread)
    # NOTE(review): bare except hides the real failure (auth, network, banned
    # sub); catching a specific praw/requests exception would be safer.
    except:
        logging.info("Something went wrong with this subreddit")
        continue
    # Now threadlist should be full of thread objects. 600~ or so.
    logging.info("Threadlist is " + str(len(threadlist)) + " items long")
    logging.info("Threadlist items: " + str(len(threadlist)))
# threadlist is now a very long list full of reddit thread objects.
# The program will now go through each thread --> Read each comment --> Read each subcomment
# Look for links, and add them to the Linklist.
for thread in threadlist:
    if threadchoice == True:
        threadlinks.append(thread.url)
    if commentchoice == True:
        # NOTE(review): getAll calls r.submission(submissionId) but receives a
        # full thread object here, not an id — confirm praw accepts that before
        # enabling commentchoice.
        for comment in getAll(reddit, thread):
            threadcomments.append(comment)
LinkLog = open("Link.txt", "w")
if threadchoice == True:
    CreateLog(threadlinks,LinkLog)
if commentchoice == True:
    # NOTE(review): CreateLog closes LinkLog, so if both flags are True this
    # second call writes to an already-closed file and raises ValueError.
    LinkList = CreateLinkList(threadcomments)
    Results = PurifyLinkList(LinkList)
    CreateLog(Results,LinkLog)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement