#!/usr/bin/python
'''
Created on Apr 2, 2010
@author: bpgergo@gmail.com
This script is free to use.
'''
from datetime import datetime, timedelta
from html5lib import treebuilders, treewalkers, serializer
import html5lib
import re
import urllib2
import sys
def openURL(url):
    """
    Utility function: fetch ``url`` while sending a browser-like User-Agent.

    Returns a ``(page, real_url)`` tuple. ``page`` is the open response
    object produced by ``urllib2.urlopen``; ``real_url`` is the address the
    response reports through ``geturl()``, which may differ from the ``url``
    argument when the request was redirected.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5",
    }
    response = urllib2.urlopen(urllib2.Request(url, headers=browser_headers))
    return (response, response.geturl())
def daterange(start, stop, step=timedelta(days=1), inclusive=True):
    """
    Utility generator: yield values from ``start`` towards ``stop``.

    start, stop -- bounds of the range (e.g. ``datetime`` objects)
    step        -- ``timedelta`` added on every iteration; a negative step
                   counts downwards, and steps shorter than one day are
                   supported. A zero step produces no intermediate values.
    inclusive   -- when True (the default) ``stop`` itself is also yielded
                   if the sequence lands exactly on it; pass False for
                   range()-like half-open behaviour.

    Values are produced lazily; no list is built.
    """
    no_step = timedelta(0)
    # Compare the whole step against zero instead of looking at step.days
    # only: a sub-day step such as timedelta(hours=1) has .days == 0 and
    # would otherwise never iterate.
    if step > no_step:
        while start < stop:
            yield start
            # not +=! don't modify the object passed in if it's mutable
            start = start + step
    elif step < no_step:
        while start > stop:
            yield start
            start = start + step
    # The loops stop as soon as start reaches or passes stop; emit the end
    # point only when it was hit exactly.
    if inclusive and start == stop:
        yield start
def processURLindex(url):
    """
    Process the URL of an index.hu search result page.
    e.g. http://index.hu/24ora/?s=LMP&tol=2010-04-02&ig=2010-04-02

    The page is fetched with openURL, parsed into a DOM tree with html5lib,
    and the first <span class="talalat"> element is looked up.

    Returns the first run of digits found in that span's first child node,
    as a string (callers convert it with int()). Returns None when the page
    contains no such span.

    The HTTP response is closed before returning, whether or not parsing
    succeeded.
    """
    f = openURL(url)[0]
    try:
        parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
        tree = parser.parse(f)
    finally:
        # The DOM tree is fully built (or parsing failed); either way the
        # connection is no longer needed, so release it explicitly.
        f.close()
    # Merge adjacent text nodes before reading the span's first child.
    tree.normalize()
    for span in tree.getElementsByTagName("span"):
        if span.hasAttribute("class") and (span.getAttribute("class") == "talalat"):
            # NOTE(review): assumes the span's first child is a text node
            # containing at least one digit -- confirm against the live page.
            return re.findall(r'\d+', span.firstChild.data)[0]
    return None
def daterange2URLindex(term, start_date, end_date):
    """
    Print daily and cumulative search result counts for ``term``.

    For each date produced by daterange(start_date, end_date) the search URL
    restricted to that single day is built, its result count is obtained via
    processURLindex, and one TAB-separated line is printed:
    count, cumulative count, date (yyyy-mm-dd), search URL.
    """
    template = "http://index.hu/24ora/?s=$TERM$&tol=2010-04-02&ig=2010-04-02"
    date_slot = r"\d\d\d\d-\d\d-\d\d"
    running_total = 0
    for day in daterange(start_date, end_date):
        day_text = day.strftime("%Y-%m-%d")
        # Fill both date slots of the template first, then the search term.
        search_url = re.sub(date_slot, day_text, template).replace("$TERM$", term)
        hits = int(processURLindex(search_url))
        running_total += hits
        fields = [str(hits), str(running_total), day_text, search_url]
        print("\t".join(fields))
if __name__ == '__main__':
    # Exactly three arguments are required: search term, from date, to date.
    if len(sys.argv) != 4:
        usage_lines = (
            'usage:',
            'index.hu.py [search term] [from date] [to date] > results.txt',
            'the date format is yyyy-mm-dd',
            'the output format is TAB delimited and will be the following:',
            '[count of search results]TAB[count cumulated]TAB[date]TAB[search URL for that date]',
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(-1)
    search_term, first_day, last_day = sys.argv[1:4]
    date_format = '%Y-%m-%d'
    start_date = datetime.strptime(first_day, date_format)
    end_date = datetime.strptime(last_day, date_format)
    daterange2URLindex(search_term, start_date, end_date)