Advertisement
Guest User

Untitled

a guest
Jun 22nd, 2017
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.14 KB | None | 0 0
import re
import time
import traceback
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
  6.  
  7. def isCurrentYear(dateStr):
  8. return re.search("(1[2-9]mon)|(20(0|1)[^7])", dateStr) is None
  9.  
  10. def writePostCount(postCount, filename):
  11. input_filename=open(filename, "w")
  12. for (k,v) in postCount.items():
  13. input_filename.write(k+":"+str(v)+"\n")
  14. input_filename.close()
  15. return filename
  16.  
  17. postCount = {}
  18. forums = ["590", "29", "642", "665", "44", "538"]
  19. for forum in forums:
  20. forum_pageNum = 1
  21. forum_isCurrentYear = True
  22. while forum_isCurrentYear:
  23. forum_urlstr = "https://epicmafia.com/forum/"+forum+"?page="+str(forum_pageNum)
  24. print("Scanning "+forum_urlstr+"...")
  25. while True:
  26. try:
  27. with urllib.request.urlopen(forum_urlstr) as forum_url:
  28. forum_page = forum_url.read()
  29. except urllib.error.HTTPError:
  30. continue
  31. break
  32. forum_soup = BeautifulSoup(forum_page, "html.parser")
  33. forum_soup.prettify("latin-1")
  34. threads = forum_soup.find_all(name="a", class_="topic-title")
  35. for thread in threads:
  36. lastpost = thread.parent.parent.parent.find(class_="lastpost").find(class_="sg").text
  37. #print(lastpost)
  38. if not isCurrentYear(lastpost):
  39. forum_isCurrentYear = False
  40. break
  41. #continue
  42. thread_pageNum = -1
  43. while True:
  44. thread_urlstr = "https://epicmafia.com"+thread.get("href")+("" if thread_pageNum==-1 else ("?page="+str(thread_pageNum)))
  45. while True:
  46. try:
  47. with urllib.request.urlopen(thread_urlstr) as thread_url:
  48. thread_page = thread_url.read()
  49. except urllib.error.HTTPError:
  50. continue
  51. break
  52. thread_soup = BeautifulSoup(thread_page, "html.parser")
  53. thread_soup.prettify("latin-1")
  54. posts = thread_soup.find_all(class_="postuser")
  55. print("Scanning "+thread_urlstr+"...")
  56. for post in posts:
  57. userlink = post.find(name="a", class_="tt")
  58. created_at = post.find(class_="created_at").text
  59. #print(created_at)
  60. if not isCurrentYear(created_at):
  61. continue
  62. if userlink is None: #deleted user
  63. continue
  64. username = userlink.text
  65. if username not in postCount:
  66. postCount[username] = 0
  67. postCount[username] += 1
  68. #print(username)
  69. if thread_soup.find(class_="pagenav") is None:
  70. break
  71. thread_pageNum = int(thread_soup.find(class_="pagenav").find(class_="selected").contents[0].text)
  72. if thread_pageNum==1:
  73. break
  74. thread_pageNum -= 1
  75. forum_pageNum += 1
  76. writePostCount(postCount, "test.txt")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement