Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from collections import Counter #count posts per user
from urllib.parse import parse_qs, urlsplit #read the post ID out of a permalink's query string

import requests #import library to fetch the site
from bs4 import BeautifulSoup #import library to parse site content
#context: each post has a unique ID. These IDs are in chronological order, so they can be used to get a subset of a thread.

# Offset used by the legacy slice in extract_post_id(). 24 happens to be the
# length of "?do=findComment&comment=" -- presumably the permalink suffix the
# forum appends to the thread URL; TODO confirm against a live page.
LEGACY_ID_OFFSET = 24
TOTAL_LABEL = "Total"


def parse_optional_int(text, default):
    """Return ``int(text)``, or ``default`` when text is blank or not a number."""
    try:
        return int(text)
    except ValueError:
        return default


def extract_post_id(href, root):
    """Return the numeric post ID encoded in a post permalink.

    The ``comment`` query parameter is preferred because it does not depend
    on ``root`` matching the permalink prefix character for character (a
    missing trailing slash used to shift the slice and crash ``int()``).
    When the parameter is absent, fall back to the original fixed-offset
    slice so previously working input keeps working.

    Raises ValueError if the extracted text is not an integer.
    """
    comment_values = parse_qs(urlsplit(href).query).get("comment")
    if comment_values:
        return int(comment_values[0])
    return int(href[len(root) + LEGACY_ID_OFFSET:])


def count_posts(post_ids, authors, start_id, end_id):
    """Count posts per author on one page, keeping IDs in [start_id, end_id].

    ``post_ids`` and ``authors`` are paired by position. ``zip`` stops at the
    shorter sequence, so a page with more author links than post links no
    longer raises IndexError.
    """
    return Counter(
        author
        for post_id, author in zip(post_ids, authors)
        if start_id <= post_id <= end_id
    )


def format_table(counts):
    """Return the report rows: one per user (sorted alphabetically), then a total.

    The name column is as wide as the longest label, *including* "Total", so
    the separator stays aligned even when every username is shorter than it.
    """
    width = max(len(label) for label in list(counts) + [TOTAL_LABEL])
    rows = [name.ljust(width) + ' | ' + str(counts[name]) for name in sorted(counts)]
    rows.append(TOTAL_LABEL.ljust(width) + ' | ' + str(sum(counts.values())))
    return rows


def fetch_soup(url):
    """Download ``url`` and return its HTML parsed with BeautifulSoup."""
    return BeautifulSoup(requests.get(url).content, 'html.parser')


def prompt_for_thread():
    """Ask for the thread URL until it can be fetched; return (root, first page).

    Only request failures trigger a re-prompt. Ctrl+C and end-of-input are
    deliberately NOT caught here so the user can always leave the prompt.
    """
    while True:
        root = input('Enter base url: ') #tell the user to enter the base URL
        try:
            return root, fetch_soup(root)
        except requests.exceptions.RequestException as err:
            print("Could not load {!r}: {}".format(root, err))


def count_pages(soup):
    """Return the thread's page count, read from the rel="last" pagination link.

    Falls back to 1 when the link, or a numeric ``data-page`` on it, is missing.
    """
    last_link = soup.find("a", {"rel": "last"})
    if last_link is None:
        return 1
    try:
        return int(last_link.get('data-page'))
    except (TypeError, ValueError):
        return 1


def collect_activity(root, pages, start_id, end_id):
    """Walk every page of the thread and tally in-range posts per username."""
    activity = Counter()
    for page in range(1, pages + 1): #cycle through the pages
        soup = fetch_soup(root + '?page=' + str(page))
        id_links = soup.find_all(lambda tag: tag.name == 'a' and tag.get('class') == ['ipsType_blendLinks']) #fetch all IDs
        post_ids = [extract_post_id(link.get('href'), root) for link in id_links]
        if not post_ids:
            continue #nothing recognisable on this page; skip it instead of crashing
        if post_ids[0] > end_id:
            break #IDs are chronological, so every later page is past the range too
        if post_ids[-1] < start_id:
            continue #whole page is before the range
        name_links = soup.find_all("a", {"class": "ipsType_break"}) #find all usernames in the thread
        authors = [link.get_text() for link in name_links][1:] #drop the first username (since it's the OP's name)
        activity.update(count_posts(post_ids, authors, start_id, end_id))
    return activity


def main():
    """Prompt for threads in a loop and print a per-user post count for each."""
    while True: #make it so that you can get the post count for multiple threads without reopening the program
        root, first_page = prompt_for_thread()
        pages = count_pages(first_page)
        #start and end IDs are both inclusive; a blank answer means "no limit"
        start_id = parse_optional_int(input("Enter start ID (optional): "), 0)
        end_id = parse_optional_int(input("Enter end ID (optional): "), float("inf"))
        for row in format_table(collect_activity(root, pages, start_id, end_id)):
            print(row)


if __name__ == "__main__":
    try:
        main()
    except (EOFError, KeyboardInterrupt):
        print() #leave the terminal on a fresh line when the user quits
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement