Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import sys
- import os
- from bs4 import BeautifulSoup
- import pycurl
- from StringIO import StringIO
- import json
- from datetime import datetime
- import re
- import multiprocessing
- from functools import partial
# Base URL for article pages; the commented alternative targets match reports.
EF_URL = "http://www.eurofotbal.cz/clanky/-"
#EF_URL = "http://www.eurofotbal.cz/serie-a/reportaz/-"
# Output directory: one JSON file per scraped id is written here.
JSON_OUT_DIR = "dump-article"
#JSON_OUT_DIR = "dump-match"
# Content mode: "A" = articles, "M" = match reports (see process_article).
CONTENT = "A"
#CONTENT = "M"
# URL suffix that makes the page include its discussion forum.
EF_FORUM_PARAM = "/?forum=1"
# Expected timestamp format, e.g. "31.12.2015 18:30".
DATETIME_FORMAT = "%d.%m.%Y %H:%M"
# Number of parallel worker processes in the scraping pool.
WORKERS = 4
def get_page(number):
    """Fetch a single EF HTML page (with forum) and return it as a string.

    :param number: numeric page id appended to EF_URL.
    :raises ValueError: when the server answers with an unexpected
        HTTP status code.
    """
    body = StringIO()
    c = pycurl.Curl()
    url = "%s%d%s" % (EF_URL, number, EF_FORUM_PARAM)
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, body)
    try:
        c.perform()
        status = c.getinfo(pycurl.HTTP_CODE)
        # Match id 115968 is known to answer 503 in match mode; tolerate it.
        if CONTENT == "M" and status in [503] and number == 115968:
            pass
        elif status not in [200, 302]:
            raise ValueError('Received code: %d for id %d' % (status, number))
    finally:
        # Always release the curl handle -- the original leaked it whenever
        # perform() failed or the status check raised ValueError.
        c.close()
    return body.getvalue()
def process_article(number):
    """Scrape page `number` and dump its metadata + forum tree as JSON.

    Writes JSON_OUT_DIR/<number>.  Ids whose output file already exists
    are skipped, so an interrupted run can be resumed.

    :param number: page id to fetch and parse.
    :raises ValueError: when a forum post carries a malformed timestamp.
    """
    filepath = "%s/%d" % (JSON_OUT_DIR, number)
    # Resume support: an existing file means this id was already processed.
    if os.path.exists(filepath):
        return  # Skipping
    page = BeautifulSoup(get_page(number))
    data = {}
    if CONTENT == 'A':
        article = page.find('div', attrs={'class': 'article'})
    else:
        article = page.find('div', attrs={'class': 'matchmain'})
    forum = page.find('div', attrs={'class': 'forum'})
    # The page exists only when both the article and the forum block are found;
    # otherwise an empty {} is dumped, which also marks the id as processed.
    if article and forum:
        data['id'] = number
        data['name'] = article.h2.text.encode('utf8')
        #data['is_express'] = article.find('div', attrs={'class': 'text'}).text.isspace()
        data['created'] = article.find('div', attrs={'class': 'date'}).text[0:16]
        # Validate the article timestamp; fall back to a sentinel on garbage.
        try:
            datetime.strptime(data['created'], DATETIME_FORMAT)
        except ValueError:
            data['created'] = "01.01.2000 00:00"
        if CONTENT == 'A':
            data['views'] = int(page.find('div', attrs={'class': 'fr'}, text=re.compile("zobrazen")).text.rsplit(' ', 1)[1])
        else:
            data['views'] = 0
        data['forum'] = []
        # Reconstruct the reply tree from each post's CSS left-indentation
        # (presumably 15px per nesting level -- TODO confirm against markup).
        indent_old = 5
        stack = []
        parent = None
        for post in forum.findAll('div', attrs={'class': 'post'}):
            comment = {}
            comment['id'] = int(post.get('id')[1:])
            # Pixel offset parsed from the style attribute, e.g. "... 35px;".
            indent = int(post.get('style').rsplit(' ', 1)[1][:-3])
            if indent > indent_old:
                # One level deeper: the previous post becomes the parent.
                parent = stack.pop()
                stack.append(parent)
            elif indent < indent_old:
                # Climbed back up: drop one ancestor per closed level.
                level_diff = (indent_old - indent) / 15
                for _ in range(level_diff+2):
                    if stack:
                        parent = stack.pop()
                    else:
                        parent = None
            elif stack:
                # Same level: replace the previous sibling on the stack.
                stack.pop()
            stack.append(comment['id'])
            indent_old = indent
            comment['parent'] = parent
            comment['author'] = post.find(['a', 'div'], attrs={'class': 'name'}).text.encode('utf8')
            comment['created'] = post.find('div', attrs={'class': 'time'}).text
            # Unlike the article date, a malformed post time is fatal on purpose.
            datetime.strptime(comment['created'], DATETIME_FORMAT)
            comment['text'] = post.find('div', attrs={'class': 'text'}).encode_contents()
            data['forum'].append(comment)
    # Open the output only after the page was fetched and parsed: the original
    # opened it first, so any exception left behind a truncated/empty file that
    # permanently marked the id as done (and leaked the descriptor).
    with open(filepath, 'w') as out:
        json.dump(data, out, ensure_ascii=False, indent=2)
- def process_articles(top, bottom):
- """Process articles interval"""
- # Loop article ids
- for i in xrange(top, bottom-1, -WORKERS):
- process_article(i)
- print "ID %d completed, %d ID(s) remaining." % (i, i-bottom)
if __name__ == "__main__":
    # Usage: script.py <top> [<bottom>] -- scrape ids from top down to bottom.
    if len(sys.argv) == 3:
        bottom = int(sys.argv[2])
        top = int(sys.argv[1])
    elif len(sys.argv) == 2:
        bottom = 0
        top = int(sys.argv[1])
    else:
        sys.exit(0)
    # Create output directory
    if not os.path.exists(JSON_OUT_DIR):
        os.makedirs(JSON_OUT_DIR)
    p = multiprocessing.Pool(WORKERS)
    # Give every worker its own starting id; each then steps by -WORKERS,
    # so together they partition the [bottom, top] interval.
    iterable = [top - i for i in range(WORKERS)]
    func = partial(process_articles, bottom=bottom)
    p.map(func, iterable)
    # Shut the pool down cleanly -- the original never closed/joined it.
    p.close()
    p.join()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement