Advertisement
Guest User

Untitled

a guest
Jul 28th, 2017
56
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.65 KB | None | 0 0
  1. import sys
  2. reload(sys)
  3. sys.setdefaultencoding('utf-8')
  4. import urllib2
  5. from retry import retry
  6. try:
  7. from BeautifulSoup import BeautifulSoup
  8. except ImportError:
  9. from bs4 import BeautifulSoup
  10.  
  11. if len(sys.argv) < 2:
  12. print 'Usage:',sys.argv[0],'file_with_subforums_urls [file_to_output_raw_data_to]'
  13. sys.exit()
  14.  
  15. topics = []
  16. posts = []
  17. stats = {}
  18.  
  19. with open(sys.argv[1]) as f:
  20. forums = f.readlines()
  21. forums = [x.strip() for x in forums]
  22.  
  23. print 'Got',len(forums),'forums to process'
  24.  
  25. @retry(urllib2.URLError,tries=3,delay=2,backoff=2)
  26. def urlopen_with_retry(url):
  27. return urllib2.urlopen(url)
  28.  
  29. for forum in forums:
  30. next_page = forum
  31. print "Processing",forum
  32. while next_page:
  33. try:
  34. response = urlopen_with_retry(next_page)
  35. page = response.read()
  36. parsed_page = BeautifulSoup(page,'lxml')
  37. next_page = ''
  38. for link in parsed_page.body.find_all('a'):
  39. if (not next_page) and link.text == "Next" and link.parent.name == 'b' and link.parent.parent.name == 'td' and link.parent.parent.get('class')[0] == 'gensmall':
  40. next_page = forum.split('?')[0] + '?' + link['href'].split('?')[1]
  41. for link in parsed_page.body.find_all('a',class_="topictitle"):
  42. if link.has_attr('href') and ('viewtopic.php' in link['href']) and link.parent.name == 'td' and link.parent.get('class')[0] == 'row1':
  43. topics.append(forum.split('?')[0].replace('viewforum','viewtopic') + '?' + link['href'].split('?')[1])
  44. except:
  45. print "Skipped due to error in",forum
  46. next_page = ''
  47.  
  48. print 'Got',len(topics),'topics to process'
  49.  
  50. for topic in topics:
  51. next_page = topic
  52. print "Processing",topic
  53. while next_page:
  54. try:
  55. response = urlopen_with_retry(next_page)
  56. page = response.read()
  57. parsed_page = BeautifulSoup(page,'lxml')
  58. next_page = ''
  59. for link in parsed_page.body.find_all('a'):
  60. if (not next_page) and link.text == "Next" and link.parent.name == 'b' and link.parent.parent.name == 'td' and link.parent.parent.get('class')[0] == 'gensmall':
  61. next_page = topic.split('?')[0] + '?' + link['href'].split('?')[1]
  62. for td in parsed_page.body.find_all('td',class_="gensmall"):
  63. for child in td.children:
  64. if child.name == 'div' and child.has_attr('style') and child['style'] == 'float: right;':
  65. posts.append(child.text.replace('Posted: ',''))
  66. except:
  67. print "Skipped due to error in",topic
  68. next_page = ''
  69.  
  70. print 'Got',len(posts),'posts'
  71.  
  72. if len(sys.argv) == 3:
  73. output = open(sys.argv[2],'w')
  74. for post in posts:
  75. output.write("%s\n" % post)
  76.  
  77. for post in posts:
  78. post = post.split(' ')
  79. if not stats.has_key(post[3]):
  80. stats[post[3]] = 0
  81. stats[post[3]] += 1
  82.  
  83. print "Results as tsv:"
  84.  
  85. for key in sorted(stats.iterkeys()):
  86. print "%s\t%s" % (key,stats[key])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement