Advertisement
Guest User

Untitled

a guest
Sep 16th, 2015
118
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.28 KB | None | 0 0
  1. import urllib2
  2. import re;
  3. import matplotlib.pyplot as plt
  4. from HTMLParser import HTMLParser
  5.  
  6. POST_REGEX = "<div class=\"content\">(.+?)<\/div>";
  7. NUMBER_REGEX = "\d+"
  8. URL = "https://slo-tech.com/forum/t158675"
  9. PER_PAGE = 50;
  10.  
  11. class MLStripper(HTMLParser):
  12.     def __init__(self):
  13.         self.reset()
  14.         self.fed = []
  15.     def handle_data(self, d):
  16.         self.fed.append(d)
  17.     def get_data(self):
  18.         return ''.join(self.fed)
  19.  
  20. def strip_tags(html):
  21.     s = MLStripper()
  22.     s.feed(html)
  23.     return s.get_data()
  24.  
  25. def parse_page(body):
  26.     ints = [];
  27.     posts = re.findall(POST_REGEX, body);
  28.     for post in posts:
  29.         num = re.search(NUMBER_REGEX, strip_tags(post));
  30.         if num:
  31.             integer = int(num.group());
  32.             if integer < 10000 and integer > -10000:
  33.                 ints.append(integer)
  34.  
  35.     return ints
  36.  
  37.  
  38. result = [];
  39. cur_post = 0;
  40. while True:
  41.     print "Searching posts:", cur_post,
  42.     print "-", cur_post + PER_PAGE;
  43.     req = urllib2.Request(URL + '/' + str(cur_post))
  44.     try:
  45.         response = urllib2.urlopen(req)
  46.     except Exception as e:
  47.         break
  48.  
  49.     body = response.read()
  50.     result += parse_page(body);
  51.  
  52.  
  53.     cur_post += PER_PAGE
  54.  
  55. numbers = open("numbers.txt", 'w');
  56. for num in result:
  57.   numbers.write("%d\n" % num)
  58.  
  59. plt.plot(result)
  60. plt.ylabel('Slo-tech counting thread')
  61. plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement