Guest User

index.hu date range search

a guest
May 17th, 2010
291
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.10 KB | None | 0 0
  1. #!/usr/bin/python
  2. '''
  3. Created on Apr 2, 2010
  4.  
  5.  
  6. it is free to use this
  7. '''
  8.  
  9. from datetime import datetime, timedelta
  10. from html5lib import treebuilders, treewalkers, serializer
  11. import html5lib
  12. import re
  13. import urllib2
  14. import sys
  15.  
  16. def openURL (url):
  17.     """
  18.    utlitity function, returns (page, url)
  19.    sets user_agent and resolves possible redirection
  20.    returned url may be different than initial url in the case of a redirect
  21.    """    
  22.     request = urllib2.Request(url)
  23.     user_agent = "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"
  24.     request.add_header("User-Agent", user_agent)
  25.     pagefile=urllib2.urlopen(request)
  26.     realurl = pagefile.geturl()
  27.     return (pagefile, realurl)
  28.  
  29. def daterange(start, stop, step=timedelta(days=1), inclusive=True):
  30.     """
  31.    utility function, returns list of dates within the specified range
  32.    """
  33.     # inclusive=False to behave like range by default
  34.     if step.days > 0:
  35.         while start < stop:
  36.             yield start
  37.             start = start + step
  38.             # not +=! don't modify object passed in if it's mutable
  39.             # since this function is not restricted to
  40.             # only types from datetime module
  41.     elif step.days < 0:
  42.         while start > stop:
  43.             yield start
  44.             start = start + step
  45.     if inclusive and start == stop:
  46.         yield start
  47.  
  48. def processURLindex(url):
  49.     """
  50.    process an url of an index.hu search result page
  51.    returns number of search results
  52.    e.g. http://index.hu/24ora/?s=LMP&tol=2010-04-02&ig=2010-04-02    
  53.    """
  54.     (f, new_url) = openURL(url)
  55.     parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
  56.     tree = parser.parse(f)
  57.     tree.normalize()
  58.     for span in tree.getElementsByTagName("span"):            
  59.         if span.hasAttribute("class") and (span.getAttribute("class") =="talalat"):
  60.             return re.findall(r'\d+', span.firstChild.data)[0]
  61.  
  62.  
  63.  
  64. def daterange2URLindex(term, start_date, end_date):
  65.     urlpattern = "http://index.hu/24ora/?s=$TERM$&tol=2010-04-02&ig=2010-04-02"
  66.     cum = 0
  67.     for single_date in daterange(start_date, end_date):
  68.         datestr = single_date.strftime("%Y-%m-%d")
  69.         url = re.sub(r"\d\d\d\d-\d\d-\d\d", datestr, urlpattern)
  70.         url = url.replace("$TERM$", term);
  71.         num = int(processURLindex(url))
  72.         cum = cum + num
  73.         print "\t".join([str(num), str(cum), datestr, url])  
  74.  
  75.  
  76. if __name__ == '__main__':
  77.     if len(sys.argv) == 4:
  78.         start_date = datetime.strptime(sys.argv[2], '%Y-%m-%d')
  79.         end_date = datetime.strptime(sys.argv[3], '%Y-%m-%d')
  80.         daterange2URLindex(sys.argv[1], start_date, end_date)
  81.     else:
  82.         print 'usage:'
  83.         print 'index.hu.py [search term] [from date] [to date] > results.txt'
  84.         print 'the date format is yyyy-mm-dd'
  85.         print 'the output format is TAB delimited and will be the following:'
  86.         print '[count of search results]TAB[count cumulated]TAB[date]TAB[search URL for that date]'
  87.         sys.exit(-1)
Advertisement
Add Comment
Please, Sign In to add comment