# Untitled

import urllib2
import re;
import matplotlib.pyplot as plt
from HTMLParser import HTMLParser

# Regex patterns are written as raw strings so the backslashes reach the
# regex engine verbatim instead of relying on unrecognised string escapes.

# Captures the inner HTML of one post body.  Note that "." does not match a
# newline, so only content that sits on a single line is captured.
POST_REGEX = r'<div class="content">(.+?)<\/div>'
# First run of decimal digits (unsigned) inside a post.
NUMBER_REGEX = r"\d+"
# Base address of the thread; a post offset is appended as "/<offset>".
URL = "https://slo-tech.com/forum/t158675"
# Step by which the post offset is advanced between requests.
PER_PAGE = 50

class MLStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text nodes.

    Feed markup with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base initialiser instead of calling reset() by hand, so
        # whatever state the parser sets up in __init__ is always present.
        # An explicit base-class call is used (rather than super()) because
        # it also works when the base is an old-style class.
        HTMLParser.__init__(self)
        # Text fragments in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text recorded so far, joined in document order."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    A fresh MLStripper is used for every call, so no text carries over
    between calls.
    """
    stripper = MLStripper()
    stripper.feed(html)
    text_only = stripper.get_data()
    return text_only

def parse_page(body):
    """Extract one number per post from the HTML of a thread page.

    For every fragment matched by POST_REGEX the markup is stripped and the
    first match of NUMBER_REGEX is converted to an int.  Posts without a
    number, or whose number falls outside the open interval
    (-10000, 10000), are skipped.

    Returns the accepted numbers as a list, in page order.
    """
    found = []
    for fragment in re.findall(POST_REGEX, body):
        match = re.search(NUMBER_REGEX, strip_tags(fragment))
        if match is None:
            # No digits anywhere in this post.
            continue
        value = int(match.group())
        if -10000 < value < 10000:
            found.append(value)
    return found


# Walk the thread by requesting URL/<offset> for offsets 0, PER_PAGE,
# 2 * PER_PAGE, ... and collect the numbers found on every page.
result = []
cur_post = 0
while True:
    print("Searching posts: %d - %d" % (cur_post, cur_post + PER_PAGE))
    req = urllib2.Request(URL + '/' + str(cur_post))
    try:
        response = urllib2.urlopen(req)
    except Exception as e:
        # Any failure to open the page ends the crawl; this is the loop's
        # only exit.  The catch is deliberately broad so that the numbers
        # collected so far are still written and plotted, but the reason is
        # reported instead of being dropped silently.
        # NOTE(review): presumably the server answers with an error for
        # offsets past the last post -- confirm, otherwise this never stops.
        print("Stopping at post %d: %s" % (cur_post, e))
        break

    # Always release the connection, even if reading the body fails.
    try:
        body = response.read()
    finally:
        response.close()
    result += parse_page(body)

    cur_post += PER_PAGE

# Dump the collected numbers, one per line.  The with-block guarantees the
# file is flushed and closed before the (blocking) plot window is shown.
with open("numbers.txt", 'w') as numbers:
    for num in result:
        numbers.write("%d\n" % num)

# Plot the numbers in the order they were collected.
plt.plot(result)
plt.ylabel('Slo-tech counting thread')
plt.show()