Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- '''
- Created on Apr 2, 2010
- @author: bpgergo@gmail.com
- it is free to use this
- '''
- from datetime import datetime, timedelta
- from html5lib import treebuilders, treewalkers, serializer
- import html5lib
- import re
- import urllib2
- import sys
def openURL (url):
    """
    Utility function: open url and return (page, url).

    The request is sent with a browser-like User-Agent header.  The
    returned page is the open response object; the returned url is the
    address reported by the response, which may differ from the url
    passed in when a redirect was followed.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5",
    }
    response = urllib2.urlopen(urllib2.Request(url, headers=browser_headers))
    # geturl() reports the address actually retrieved.
    return (response, response.geturl())
def daterange(start, stop, step=timedelta(days=1), inclusive=True):
    """
    Utility generator: yield evenly spaced values from start towards stop.

    start     -- first value of the sequence
    stop      -- end of the range; it is yielded only when inclusive is
                 true and the sequence lands exactly on it
    step      -- increment added on every iteration (default: one day);
                 a negative step counts downwards
    inclusive -- True (the default) also yields stop when it is reached
                 exactly; pass False for a half-open range like range()

    The values are not restricted to the datetime module: anything that
    supports +, - and ordering comparisons works (e.g. plain numbers).
    A zero step never advances, so at most start itself is yielded
    (when inclusive is true and start == stop).
    """
    # Derive the zero value from step itself instead of inspecting
    # step.days: a timedelta shorter than one day has days == 0 and would
    # otherwise be mistaken for "no step at all", and plain numbers have
    # no .days attribute.
    zero = step - step
    if step > zero:
        while start < stop:
            yield start
            # not +=! don't modify the object passed in if it is mutable
            start = start + step
    elif step < zero:
        while start > stop:
            yield start
            start = start + step
    if inclusive and start == stop:
        yield start
def processURLindex(url):
    """
    Process the url of an index.hu search result page.

    e.g. http://index.hu/24ora/?s=LMP&tol=2010-04-02&ig=2010-04-02

    Returns the number of search results as a string of digits: the
    first run of digits found in the text of the first <span> whose
    class attribute is exactly "talalat".  Returns None when the page
    contains no such span.
    """
    # Only the page object is needed here; the resolved url is ignored.
    (f, _) = openURL(url)
    try:
        parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
        tree = parser.parse(f)
    finally:
        # The response is fully consumed by parse(); release the
        # connection instead of leaving it to the garbage collector.
        f.close()
    tree.normalize()
    for span in tree.getElementsByTagName("span"):
        if span.hasAttribute("class") and (span.getAttribute("class") == "talalat"):
            return re.findall(r'\d+', span.firstChild.data)[0]
    return None
def daterange2URLindex(term, start_date, end_date):
    """
    Query the search page once per day from start_date to end_date
    (both ends included) and print one TAB delimited line per day:
    result count, cumulated count, date and the search URL used.
    """
    urlpattern = "http://index.hu/24ora/?s=$TERM$&tol=2010-04-02&ig=2010-04-02"
    running_total = 0
    for day in daterange(start_date, end_date):
        day_text = day.strftime("%Y-%m-%d")
        # Dates are substituted before the term is inserted, so a term
        # that itself looks like a date is left untouched.
        dated_url = re.sub(r"\d\d\d\d-\d\d-\d\d", day_text, urlpattern)
        search_url = dated_url.replace("$TERM$", term)
        hits = int(processURLindex(search_url))
        running_total += hits
        columns = [str(hits), str(running_total), day_text, search_url]
        print("\t".join(columns))
# Script entry point: index.hu.py [search term] [from date] [to date]
if __name__ == '__main__':
    if len(sys.argv) == 4:
        start_date = datetime.strptime(sys.argv[2], '%Y-%m-%d')
        end_date = datetime.strptime(sys.argv[3], '%Y-%m-%d')
        daterange2URLindex(sys.argv[1], start_date, end_date)
    else:
        # The usage text goes to stderr: stdout is meant to be redirected
        # into the results file and must only ever contain result rows.
        usage_lines = [
            'usage:',
            'index.hu.py [search term] [from date] [to date] > results.txt',
            'the date format is yyyy-mm-dd',
            'the output format is TAB delimited and will be the following:',
            '[count of search results]TAB[count cumulated]TAB[date]TAB[search URL for that date]',
        ]
        sys.stderr.write("\n".join(usage_lines) + "\n")
        sys.exit(-1)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement