Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import urllib2
- import re;
- import matplotlib.pyplot as plt
- from HTMLParser import HTMLParser
# Captures the inner HTML of one post body. It is applied without
# re.DOTALL, so "." does not cross newlines and only content that sits on
# a single line is matched. Raw strings keep the backslashes literal
# instead of relying on Python passing unknown escapes through.
POST_REGEX = r'<div class="content">(.+?)<\/div>'
# First run of decimal digits in a post's text (no sign is matched).
NUMBER_REGEX = r"\d+"
# Base address of the thread; the post offset is appended as "/<offset>".
URL = "https://slo-tech.com/forum/t158675"
# Offset step between consecutive page requests.
PER_PAGE = 50
class MLStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text nodes.

    Feed it HTML with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base initializer instead of calling reset() directly: it
        # calls reset() itself and also sets up parser state that newer
        # HTMLParser versions require before feed() can be used.
        HTMLParser.__init__(self)
        # Text fragments in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text content of *html* with all markup removed.

    Args:
        html: HTML fragment as a string.

    Returns:
        The text nodes of the fragment joined into one string.
    """
    stripper = MLStripper()
    stripper.feed(html)
    # feed() may hold back trailing input it considers incomplete (for
    # example text ending in an unterminated "&name"); close() forces the
    # parser to process whatever is still buffered.
    stripper.close()
    return stripper.get_data()
def parse_page(body):
    """Extract one number per post from the HTML of a forum page.

    Args:
        body: HTML source of the page.

    Returns:
        A list of ints: for each post matched by POST_REGEX whose
        tag-stripped text contains digits, the first run of digits,
        kept only when it lies strictly between -10000 and 10000.
    """
    found = []
    for post_html in re.findall(POST_REGEX, body):
        match = re.search(NUMBER_REGEX, strip_tags(post_html))
        if match is None:
            # Post without any digits contributes nothing.
            continue
        value = int(match.group())
        # Out-of-range values are dropped; presumably these are stray
        # numbers (years, IDs) rather than counts -- verify with the author.
        if -10000 < value < 10000:
            found.append(value)
    return found
# Walk the thread one page at a time, collecting the numbers from each page.
result = []
cur_post = 0
while True:
    print("Searching posts: %d - %d" % (cur_post, cur_post + PER_PAGE))
    req = urllib2.Request(URL + '/' + str(cur_post))
    try:
        response = urllib2.urlopen(req)
    except Exception as e:
        # A failed request is the only way out of this loop, so it is
        # treated as "no more pages"; report why instead of hiding it.
        print("Stopping at offset %d: %s" % (cur_post, e))
        break
    try:
        body = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    result.extend(parse_page(body))
    cur_post += PER_PAGE

# Dump the collected numbers, one per line; the with-block guarantees the
# file is flushed and closed before the (blocking) plot window opens.
with open("numbers.txt", 'w') as numbers:
    numbers.writelines("%d\n" % num for num in result)

# Plot the numbers in the order they were collected.
plt.plot(result)
plt.ylabel('Slo-tech counting thread')
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement