# index.hu date range search

#!/usr/bin/python
'''
Created on Apr 2, 2010

@author: bpgergo@gmail.com

it is free to use this
'''

from datetime import datetime, timedelta
from html5lib import treebuilders, treewalkers, serializer
import html5lib
import re
import urllib2
import sys

def openURL(url):
    """
    Utility function: open ``url`` and return ``(page, final_url)``.

    The request carries a browser-like User-Agent header.  ``final_url`` is
    whatever the opened response reports through ``geturl()``, so after a
    redirect it may differ from the ``url`` that was passed in.
    """
    browser_agent = "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"
    req = urllib2.Request(url, headers={"User-Agent": browser_agent})
    response = urllib2.urlopen(req)
    return (response, response.geturl())

def daterange(start, stop, step=timedelta(days=1), inclusive=True):
    """
    utility function, generates the values from start towards stop, one
    step apart (this is a generator, not a list)

    start     -- first value yielded
    stop      -- end of the range; it is yielded only when inclusive is true
                 and a step lands on it exactly
    step      -- increment between values; may be negative to count down
    inclusive -- True (the default) to include stop, False to behave like
                 the built-in range()

    Nothing is yielded when step points away from stop.  A zero step yields
    start once if start == stop and inclusive is true, otherwise nothing.
    """
    # Find the direction by comparing step against the zero of its own type
    # rather than looking at step.days: a sub-day timedelta such as
    # timedelta(hours=12) has days == 0 but is still a positive step, and
    # plain numbers have no .days attribute at all.
    zero = step - step
    if step > zero:
        while start < stop:
            yield start
            # not +=! rebinding leaves a mutable object passed in by the
            # caller untouched
            start = start + step
    elif step < zero:
        while start > stop:
            yield start
            start = start + step
    # start now sits on stop only if the steps landed on it exactly (or the
    # range was empty to begin with because start == stop).
    if inclusive and start == stop:
        yield start

def processURLindex(url):
    """
    process an url of an index.hu search result page
    e.g. http://index.hu/24ora/?s=LMP&tol=2010-04-02&ig=2010-04-02

    The page is parsed into a DOM tree with html5lib; the first
    <span class="talalat"> element is located and the first run of digits in
    that span's first child node is returned.

    Returns the number of search results as a string of digits (callers
    convert it with int()), or None when the page has no such span.
    Raises IndexError when the span's text contains no digits.
    """
    # Only the page object is needed; the post-redirect URL is ignored.
    f = openURL(url)[0]
    try:
        parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
        tree = parser.parse(f)
    finally:
        # Release the HTTP connection as soon as the page has been parsed
        # instead of leaving it open until garbage collection.
        f.close()
    # Merge adjacent text nodes so firstChild.data holds the whole leading text.
    tree.normalize()
    for span in tree.getElementsByTagName("span"):
        if span.hasAttribute("class") and span.getAttribute("class") == "talalat":
            return re.findall(r'\d+', span.firstChild.data)[0]
    return None


def daterange2URLindex(term, start_date, end_date):
    """
    print the daily and cumulated index.hu hit counts of a search term

    For every day from start_date to end_date (both included) a search URL
    restricted to that single day is built and fetched, and one TAB
    delimited line is printed:

        [count of search results]TAB[count cumulated]TAB[date]TAB[search URL]

    term       -- the raw search term; it is URL-encoded here
    start_date -- first day of the range (a datetime)
    end_date   -- last day of the range (a datetime)
    """
    # Local import keeps this function self-contained.
    from urllib import quote_plus

    # Encode the term so that spaces, '&', '#' or accented characters cannot
    # break or alter the query string.
    quoted_term = quote_plus(term)
    urlpattern = "http://index.hu/24ora/?s=%s&tol=%s&ig=%s"
    cum = 0
    for single_date in daterange(start_date, end_date):
        datestr = single_date.strftime("%Y-%m-%d")
        # Same day as both lower ("tol") and upper ("ig") bound: one-day window.
        url = urlpattern % (quoted_term, datestr, datestr)
        # NOTE(review): processURLindex returns None when the result span is
        # missing from the page, in which case int() raises TypeError here.
        num = int(processURLindex(url))
        cum = cum + num
        print("\t".join([str(num), str(cum), datestr, url]))


if __name__ == '__main__':
    # Command line entry point:
    #   index.hu.py [search term] [from date] [to date] > results.txt
    # The usage text is written to stderr because stdout is meant to be
    # redirected into the results file.
    usage = (
        'usage:\n'
        'index.hu.py [search term] [from date] [to date] > results.txt\n'
        'the date format is yyyy-mm-dd\n'
        'the output format is TAB delimited and will be the following:\n'
        '[count of search results]TAB[count cumulated]TAB[date]TAB[search URL for that date]\n'
    )
    if len(sys.argv) != 4:
        sys.stderr.write(usage)
        sys.exit(-1)
    try:
        start_date = datetime.strptime(sys.argv[2], '%Y-%m-%d')
        end_date = datetime.strptime(sys.argv[3], '%Y-%m-%d')
    except ValueError as err:
        # A malformed date is a usage error, not a reason for a traceback.
        sys.stderr.write('error: %s\n' % err)
        sys.stderr.write(usage)
        sys.exit(-1)
    daterange2URLindex(sys.argv[1], start_date, end_date)