SHARE
TWEET

index.hu date range search

a guest May 17th, 2010 136 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python
  2. '''
  3. Created on Apr 2, 2010
  4.  
  5. @author: bpgergo@gmail.com
  6.  
  7. it is free to use this
  8. '''
  9.  
  10. from datetime import datetime, timedelta
  11. from html5lib import treebuilders, treewalkers, serializer
  12. import html5lib
  13. import re
  14. import urllib2
  15. import sys
  16.  
  17. def openURL (url):
  18.     """
  19.    utlitity function, returns (page, url)
  20.    sets user_agent and resolves possible redirection
  21.    returned url may be different than initial url in the case of a redirect
  22.    """    
  23.     request = urllib2.Request(url)
  24.     user_agent = "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"
  25.     request.add_header("User-Agent", user_agent)
  26.     pagefile=urllib2.urlopen(request)
  27.     realurl = pagefile.geturl()
  28.     return (pagefile, realurl)
  29.  
  30. def daterange(start, stop, step=timedelta(days=1), inclusive=True):
  31.     """
  32.    utility function, returns list of dates within the specified range
  33.    """
  34.     # inclusive=False to behave like range by default
  35.     if step.days > 0:
  36.         while start < stop:
  37.             yield start
  38.             start = start + step
  39.             # not +=! don't modify object passed in if it's mutable
  40.             # since this function is not restricted to
  41.             # only types from datetime module
  42.     elif step.days < 0:
  43.         while start > stop:
  44.             yield start
  45.             start = start + step
  46.     if inclusive and start == stop:
  47.         yield start
  48.  
  49. def processURLindex(url):
  50.     """
  51.    process an url of an index.hu search result page
  52.    returns number of search results
  53.    e.g. http://index.hu/24ora/?s=LMP&tol=2010-04-02&ig=2010-04-02    
  54.    """
  55.     (f, new_url) = openURL(url)
  56.     parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
  57.     tree = parser.parse(f)
  58.     tree.normalize()
  59.     for span in tree.getElementsByTagName("span"):            
  60.         if span.hasAttribute("class") and (span.getAttribute("class") =="talalat"):
  61.             return re.findall(r'\d+', span.firstChild.data)[0]
  62.  
  63.  
  64.  
  65. def daterange2URLindex(term, start_date, end_date):
  66.     urlpattern = "http://index.hu/24ora/?s=$TERM$&tol=2010-04-02&ig=2010-04-02"
  67.     cum = 0
  68.     for single_date in daterange(start_date, end_date):
  69.         datestr = single_date.strftime("%Y-%m-%d")
  70.         url = re.sub(r"\d\d\d\d-\d\d-\d\d", datestr, urlpattern)
  71.         url = url.replace("$TERM$", term);
  72.         num = int(processURLindex(url))
  73.         cum = cum + num
  74.         print "\t".join([str(num), str(cum), datestr, url])  
  75.  
  76.  
  77. if __name__ == '__main__':
  78.     if len(sys.argv) == 4:
  79.         start_date = datetime.strptime(sys.argv[2], '%Y-%m-%d')
  80.         end_date = datetime.strptime(sys.argv[3], '%Y-%m-%d')
  81.         daterange2URLindex(sys.argv[1], start_date, end_date)
  82.     else:
  83.         print 'usage:'
  84.         print 'index.hu.py [search term] [from date] [to date] > results.txt'
  85.         print 'the date format is yyyy-mm-dd'
  86.         print 'the output format is TAB delimited and will be the following:'
  87.         print '[count of search results]TAB[count cumulated]TAB[date]TAB[search URL for that date]'
  88.         sys.exit(-1)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top