"""Count posts per user on a fixed set of epicmafia.com forums.

For every forum id in ``forums`` the script walks the forum's thread
listing page by page, opens every thread whose "last post" stamp passes
``isCurrentYear``, and tallies one post per username for each post whose
creation stamp also passes ``isCurrentYear``.  The tallies are written to
``test.txt`` as ``username:count`` lines.
"""
import urllib.error
import urllib.request
import traceback
import re
import time

from bs4 import BeautifulSoup

BASE_URL = "https://epicmafia.com"

# A page that keeps answering with an HTTP error is retried with exponential
# backoff, but only this many times in total, so a permanent failure
# (e.g. 404) cannot spin forever.
MAX_FETCH_ATTEMPTS = 6
INITIAL_RETRY_DELAY = 1.0  # seconds; doubled after every failed attempt

# Stamps that are treated as NOT belonging to the current year: "12mon" ..
# "19mon", or "200"/"201" followed by any character other than "7".
# NOTE(review): this pins the script to 2017 (presumably the year it was
# written for) -- the pattern must be edited to target another year.
_NOT_CURRENT_YEAR = re.compile("(1[2-9]mon)|(20(0|1)[^7])")


def isCurrentYear(dateStr):
    """Return True when *dateStr* does not look like a stamp from another year.

    The check is purely textual: the result is False if *dateStr* contains
    ``12mon``..``19mon`` or ``200x``/``201x`` where ``x`` is any character
    other than ``7``; every other string (including the empty string)
    yields True.
    """
    return _NOT_CURRENT_YEAR.search(dateStr) is None


def writePostCount(postCount, filename):
    """Write *postCount* to *filename* as ``key:value`` lines.

    Args:
        postCount: Mapping of username (str) to post count.
        filename: Path of the file to create or overwrite.

    Returns:
        *filename*, unchanged.
    """
    # The context manager closes the file even if a write fails; UTF-8 keeps
    # non-ASCII usernames from depending on the platform's default encoding.
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(k + ":" + str(v) + "\n" for k, v in postCount.items())
    return filename


def _fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup's html.parser.

    Only ``urllib.error.HTTPError`` is retried (as in the original script);
    after ``MAX_FETCH_ATTEMPTS`` failed attempts the last error is re-raised.
    """
    delay = INITIAL_RETRY_DELAY
    for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
        try:
            with urllib.request.urlopen(url) as response:
                page = response.read()
        except urllib.error.HTTPError as err:
            if attempt == MAX_FETCH_ATTEMPTS:
                raise
            print("HTTP " + str(err.code) + " for " + url
                  + "; retrying in " + str(delay) + "s...")
            time.sleep(delay)
            delay *= 2
        else:
            return BeautifulSoup(page, "html.parser")


def _count_thread_posts(thread_path, postCount):
    """Add every current-year post of one thread to *postCount* (in place).

    *thread_path* is the thread link's ``href``; it is appended to
    ``BASE_URL``.
    """
    # -1 means "request the thread without a ?page= parameter".  The page
    # actually served is then read from the page navigation and the loop
    # walks downwards from there to page 1.
    page_num = -1
    while True:
        url = BASE_URL + thread_path
        if page_num != -1:
            url += "?page=" + str(page_num)
        soup = _fetch_soup(url)
        print("Scanning " + url + "...")

        for post in soup.find_all(class_="postuser"):
            created_at = post.find(class_="created_at").text
            if not isCurrentYear(created_at):
                continue
            userlink = post.find(name="a", class_="tt")
            if userlink is None:  # deleted user
                continue
            username = userlink.text
            postCount[username] = postCount.get(username, 0) + 1

        pagenav = soup.find(class_="pagenav")
        if pagenav is None:  # thread has a single page
            break
        page_num = int(pagenav.find(class_="selected").contents[0].text)
        if page_num == 1:
            break
        page_num -= 1


def _scan_forum(forum, postCount):
    """Scan one forum's thread listing and tally its current-year posts.

    Scanning stops at the first thread whose last-post stamp fails
    ``isCurrentYear`` or at the first listing page that contains no threads.
    """
    page_num = 1
    while True:
        url = BASE_URL + "/forum/" + forum + "?page=" + str(page_num)
        print("Scanning " + url + "...")
        soup = _fetch_soup(url)

        threads = soup.find_all(name="a", class_="topic-title")
        if not threads:
            # Ran past the last listing page; without this check a forum
            # with no old thread would be paged through forever.
            return

        for thread in threads:
            row = thread.parent.parent.parent
            lastpost = row.find(class_="lastpost").find(class_="sg").text
            # NOTE(review): stopping at the first old thread assumes the
            # listing is ordered newest-activity-first -- confirm.
            if not isCurrentYear(lastpost):
                return
            _count_thread_posts(thread.get("href"), postCount)

        page_num += 1


postCount = {}
forums = ["590", "29", "642", "665", "44", "538"]


def main():
    """Scan every forum in ``forums`` and write the tallies to test.txt."""
    for forum in forums:
        _scan_forum(forum, postCount)
    writePostCount(postCount, "test.txt")


if __name__ == "__main__":
    main()