Advertisement
Guest User

efdump

a guest
Aug 22nd, 2015
120
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.45 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import sys
  4. import os
  5. from bs4 import BeautifulSoup
  6. import pycurl
  7. from StringIO import StringIO
  8. import json
  9. from datetime import datetime
  10. import re
  11. import multiprocessing
  12. from functools import partial
  13.  
# --- Scraper configuration -------------------------------------------------
# Two dump modes exist: "A" scrapes article pages, "M" match-report pages.
# Switch mode by toggling the commented-out alternatives below in lockstep.
EF_URL = "http://www.eurofotbal.cz/clanky/-"            # article URL prefix; numeric id is appended
#EF_URL = "http://www.eurofotbal.cz/serie-a/reportaz/-" # alternative: match-report pages
JSON_OUT_DIR = "dump-article"                           # one JSON file per id is written here
#JSON_OUT_DIR = "dump-match"                            # alternative: match dumps
CONTENT = "A"                                           # "A" = article mode, "M" = match mode
#CONTENT = "M"
EF_FORUM_PARAM = "/?forum=1"                            # query suffix that makes the page include its forum
DATETIME_FORMAT = "%d.%m.%Y %H:%M"                      # e.g. "22.08.2015 14:30"
WORKERS = 4                                             # number of parallel scraper processes
  23.  
  24. def get_page(number):
  25.     """Fetch single EF HTML page with forum and return as a string"""
  26.     buffer = StringIO()
  27.     c = pycurl.Curl()
  28.     url = "%s%d%s" % (EF_URL, number, EF_FORUM_PARAM)
  29.     c.setopt(c.URL, url)
  30.     c.setopt(c.WRITEDATA, buffer)
  31.     c.perform()
  32.     if CONTENT == "M" and c.getinfo(pycurl.HTTP_CODE) in [503] and number == 115968:
  33.         pass
  34.     elif c.getinfo(pycurl.HTTP_CODE) not in [200, 302]:
  35.         raise ValueError('Received code: %d for id %d' % (c.getinfo(pycurl.HTTP_CODE), number))
  36.     c.close()
  37.     return buffer.getvalue()
  38.  
def process_article(number):
    """Scrape one EF page (article or match, per CONTENT) plus its forum
    and dump the result as JSON to JSON_OUT_DIR/<number>.

    Returns early (skips) if the output file already exists, which doubles
    as resume support across runs.  May raise ValueError from get_page on an
    unexpected HTTP status, or from strptime on a malformed comment timestamp.
    """
    filepath = "%s/%d" % (JSON_OUT_DIR, number)
    # Create file only if not exists -- an existing file marks the id as done.
    if not os.path.exists(filepath):
        # NOTE(review): shadows the builtin `file`; also leaks the handle and
        # leaves an empty marker file if any parsing below raises.
        file = open(filepath, 'w')
    else:
        return # Skipping

    page = BeautifulSoup(get_page(number))
    data = {}
    if CONTENT == 'A':
        article = page.find('div', attrs={'class': 'article'})
    else:
        article = page.find('div', attrs={'class': 'matchmain'})
    forum = page.find('div', attrs={'class': 'forum'})
    # Page exists -- both the content div and the forum div were found.
    # When either is missing an empty {} is still dumped below, so the id
    # stays marked as processed.
    if article and forum:
        data['id'] = number
        data['name'] = article.h2.text.encode('utf8')
        #data['is_express'] = article.find('div', attrs={'class': 'text'}).text.isspace()
        # First 16 chars of the date div, i.e. "dd.mm.YYYY HH:MM".
        data['created'] = article.find('div', attrs={'class': 'date'}).text[0:16]
        # Check date format; fall back to a sentinel date when unparsable.
        try:
            datetime.strptime(data['created'], DATETIME_FORMAT)
        except ValueError:
            data['created'] = "01.01.2000 00:00"
        if CONTENT == 'A':
            # View counter exists only on article pages; the div text ends
            # with the count ("zobrazen..." is Czech for "viewed").
            data['views'] = int(page.find('div', attrs={'class': 'fr'}, text=re.compile("zobrazen")).text.rsplit(' ', 1)[1])
        else:
            data['views'] = 0

        data['forum'] = []
        # Reply threading is reconstructed from each post's CSS left margin:
        # deeper indent means a reply to the post above it.
        indent_old = 5  # assumes top-level posts use a 5px margin -- TODO confirm
        stack = []      # ancestor comment ids of the current nesting level
        parent = None
        for post in forum.findAll('div', attrs={'class': 'post'}):
            comment = {}
            # Post DOM ids look like "<letter><number>"; strip the prefix char.
            comment['id'] = int(post.get('id')[1:])

            # Solve parents, compute from indentations.
            # Last token of the style attribute minus its 3-char unit suffix
            # (presumably "px;" -- verify against the live markup).
            indent = int(post.get('style').rsplit(' ', 1)[1][:-3])
            if indent > indent_old:
                # Went deeper: the previous post is the parent (peek the top
                # of the stack via pop + re-append).
                parent = stack.pop()
                stack.append(parent)
            elif indent < indent_old:
                # Went shallower: assumes each nesting level is 15px wide --
                # TODO confirm; pops level_diff+2 entries to land on the new
                # parent (the current level and its old sibling included).
                level_diff = (indent_old - indent) / 15  # integer division (Python 2)
                for _ in range(level_diff+2):
                    if stack:
                        parent = stack.pop()
                    else:
                        parent = None
            elif stack:
                # Same level: drop the previous sibling; parent is unchanged.
                stack.pop()
            stack.append(comment['id'])
            indent_old = indent
            comment['parent'] = parent

            comment['author'] = post.find(['a', 'div'], attrs={'class': 'name'}).text.encode('utf8')
            comment['created'] = post.find('div', attrs={'class': 'time'}).text
            # Check date format -- unlike the article date above, a malformed
            # comment timestamp is not tolerated and aborts the whole page.
            datetime.strptime(comment['created'], DATETIME_FORMAT)
            # Keep the raw inner HTML of the post body, not just its text.
            comment['text'] = post.find('div', attrs={'class': 'text'}).encode_contents()

            data['forum'].append(comment)

    json.dump(data, file, ensure_ascii=False, indent=2)
    file.close()
  107.  
  108. def process_articles(top, bottom):
  109.     """Process articles interval"""
  110.     # Loop article ids
  111.     for i in xrange(top, bottom-1, -WORKERS):
  112.         process_article(i)
  113.         print "ID %d completed, %d ID(s) remaining." % (i, i-bottom)
  114.  
  115. if __name__ == "__main__":
  116.     if len(sys.argv) == 3:
  117.         bottom = int(sys.argv[2])
  118.         top = int(sys.argv[1])
  119.     elif len(sys.argv) == 2:
  120.         bottom = 0
  121.         top = int(sys.argv[1])
  122.     else:
  123.         sys.exit(0)
  124.  
  125.     # Create output directory
  126.     if not os.path.exists(JSON_OUT_DIR):
  127.         os.makedirs(JSON_OUT_DIR)
  128.  
  129.     p = multiprocessing.Pool(WORKERS)
  130.     iterable = []
  131.     for i in range(WORKERS):
  132.         iterable.append(top - i)
  133.  
  134.     func = partial(process_articles, bottom=bottom)
  135.     p.map(func, iterable)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement