Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import sys
- import os
- from bs4 import BeautifulSoup
- import pycurl
- from StringIO import StringIO
- import json
- from datetime import datetime
- import re
- import multiprocessing
- from functools import partial
# Base URL for article pages; the commented alternative targets match reports.
EF_URL = "http://www.eurofotbal.cz/clanky/-"
#EF_URL = "http://www.eurofotbal.cz/serie-a/reportaz/-"
# Output directory: one JSON file per scraped id is written here.
JSON_OUT_DIR = "dump-article"
#JSON_OUT_DIR = "dump-match"
# Content mode: "A" = articles, "M" = match reports (see process_article).
CONTENT = "A"
#CONTENT = "M"
# URL suffix that makes the page include its discussion forum.
EF_FORUM_PARAM = "/?forum=1"
# Expected timestamp format, e.g. "31.12.2015 18:30".
DATETIME_FORMAT = "%d.%m.%Y %H:%M"
# Number of parallel worker processes in the scraping pool.
WORKERS = 4
def get_page(number):
    """Fetch a single EF HTML page (with forum) and return it as a string.

    :param number: numeric page id appended to EF_URL.
    :raises ValueError: when the server answers with an unexpected
        HTTP status code.
    """
    body = StringIO()
    c = pycurl.Curl()
    url = "%s%d%s" % (EF_URL, number, EF_FORUM_PARAM)
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, body)
    try:
        c.perform()
        status = c.getinfo(pycurl.HTTP_CODE)
        # Match id 115968 is known to answer 503 in match mode; tolerate it.
        if CONTENT == "M" and status in [503] and number == 115968:
            pass
        elif status not in [200, 302]:
            raise ValueError('Received code: %d for id %d' % (status, number))
    finally:
        # Always release the curl handle -- the original leaked it whenever
        # perform() failed or the status check raised ValueError.
        c.close()
    return body.getvalue()
def process_article(number):
    """Scrape page `number` and dump its metadata + forum tree as JSON.

    Writes JSON_OUT_DIR/<number>.  Ids whose output file already exists
    are skipped, so an interrupted run can be resumed.

    :param number: page id to fetch and parse.
    :raises ValueError: when a forum post carries a malformed timestamp.
    """
    filepath = "%s/%d" % (JSON_OUT_DIR, number)
    # Resume support: an existing file means this id was already processed.
    if os.path.exists(filepath):
        return  # Skipping
    page = BeautifulSoup(get_page(number))
    data = {}
    if CONTENT == 'A':
        article = page.find('div', attrs={'class': 'article'})
    else:
        article = page.find('div', attrs={'class': 'matchmain'})
    forum = page.find('div', attrs={'class': 'forum'})
    # The page exists only when both the article and the forum block are found;
    # otherwise an empty {} is dumped, which also marks the id as processed.
    if article and forum:
        data['id'] = number
        data['name'] = article.h2.text.encode('utf8')
        #data['is_express'] = article.find('div', attrs={'class': 'text'}).text.isspace()
        data['created'] = article.find('div', attrs={'class': 'date'}).text[0:16]
        # Validate the article timestamp; fall back to a sentinel on garbage.
        try:
            datetime.strptime(data['created'], DATETIME_FORMAT)
        except ValueError:
            data['created'] = "01.01.2000 00:00"
        if CONTENT == 'A':
            data['views'] = int(page.find('div', attrs={'class': 'fr'}, text=re.compile("zobrazen")).text.rsplit(' ', 1)[1])
        else:
            data['views'] = 0
        data['forum'] = []
        # Reconstruct the reply tree from each post's CSS left-indentation
        # (presumably 15px per nesting level -- TODO confirm against markup).
        indent_old = 5
        stack = []
        parent = None
        for post in forum.findAll('div', attrs={'class': 'post'}):
            comment = {}
            comment['id'] = int(post.get('id')[1:])
            # Pixel offset parsed from the style attribute, e.g. "... 35px;".
            indent = int(post.get('style').rsplit(' ', 1)[1][:-3])
            if indent > indent_old:
                # One level deeper: the previous post becomes the parent.
                parent = stack.pop()
                stack.append(parent)
            elif indent < indent_old:
                # Climbed back up: drop one ancestor per closed level.
                level_diff = (indent_old - indent) / 15
                for _ in range(level_diff+2):
                    if stack:
                        parent = stack.pop()
                    else:
                        parent = None
            elif stack:
                # Same level: replace the previous sibling on the stack.
                stack.pop()
            stack.append(comment['id'])
            indent_old = indent
            comment['parent'] = parent
            comment['author'] = post.find(['a', 'div'], attrs={'class': 'name'}).text.encode('utf8')
            comment['created'] = post.find('div', attrs={'class': 'time'}).text
            # Unlike the article date, a malformed post time is fatal on purpose.
            datetime.strptime(comment['created'], DATETIME_FORMAT)
            comment['text'] = post.find('div', attrs={'class': 'text'}).encode_contents()
            data['forum'].append(comment)
    # Open the output only after the page was fetched and parsed: the original
    # opened it first, so any exception left behind a truncated/empty file that
    # permanently marked the id as done (and leaked the descriptor).
    with open(filepath, 'w') as out:
        json.dump(data, out, ensure_ascii=False, indent=2)
- def process_articles(top, bottom):
- """Process articles interval"""
- # Loop article ids
- for i in xrange(top, bottom-1, -WORKERS):
- process_article(i)
- print "ID %d completed, %d ID(s) remaining." % (i, i-bottom)
if __name__ == "__main__":
    # Usage: script.py <top> [<bottom>] -- scrape ids from top down to bottom.
    if len(sys.argv) == 3:
        bottom = int(sys.argv[2])
        top = int(sys.argv[1])
    elif len(sys.argv) == 2:
        bottom = 0
        top = int(sys.argv[1])
    else:
        sys.exit(0)
    # Create output directory
    if not os.path.exists(JSON_OUT_DIR):
        os.makedirs(JSON_OUT_DIR)
    p = multiprocessing.Pool(WORKERS)
    # Give every worker its own starting id; each then steps by -WORKERS,
    # so together they partition the [bottom, top] interval.
    iterable = [top - i for i in range(WORKERS)]
    func = partial(process_articles, bottom=bottom)
    p.map(func, iterable)
    # Shut the pool down cleanly -- the original never closed/joined it.
    p.close()
    p.join()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement