Advertisement
Guest User

index.hu date range search

a guest
May 17th, 2010
214
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.10 KB | None | 0 0
  1. #!/usr/bin/python
  2. '''
  3. Created on Apr 2, 2010
  4.  
  5. @author: bpgergo@gmail.com
  6.  
  7. it is free to use this
  8. '''
  9.  
  10. from datetime import datetime, timedelta
  11. from html5lib import treebuilders, treewalkers, serializer
  12. import html5lib
  13. import re
  14. import urllib2
  15. import sys
  16.  
  17. def openURL (url):
  18.     """
  19.    utlitity function, returns (page, url)
  20.    sets user_agent and resolves possible redirection
  21.    returned url may be different than initial url in the case of a redirect
  22.    """    
  23.     request = urllib2.Request(url)
  24.     user_agent = "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"
  25.     request.add_header("User-Agent", user_agent)
  26.     pagefile=urllib2.urlopen(request)
  27.     realurl = pagefile.geturl()
  28.     return (pagefile, realurl)
  29.  
  30. def daterange(start, stop, step=timedelta(days=1), inclusive=True):
  31.     """
  32.    utility function, returns list of dates within the specified range
  33.    """
  34.     # inclusive=False to behave like range by default
  35.     if step.days > 0:
  36.         while start < stop:
  37.             yield start
  38.             start = start + step
  39.             # not +=! don't modify object passed in if it's mutable
  40.             # since this function is not restricted to
  41.             # only types from datetime module
  42.     elif step.days < 0:
  43.         while start > stop:
  44.             yield start
  45.             start = start + step
  46.     if inclusive and start == stop:
  47.         yield start
  48.  
  49. def processURLindex(url):
  50.     """
  51.    process an url of an index.hu search result page
  52.    returns number of search results
  53.    e.g. http://index.hu/24ora/?s=LMP&tol=2010-04-02&ig=2010-04-02    
  54.    """
  55.     (f, new_url) = openURL(url)
  56.     parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
  57.     tree = parser.parse(f)
  58.     tree.normalize()
  59.     for span in tree.getElementsByTagName("span"):            
  60.         if span.hasAttribute("class") and (span.getAttribute("class") =="talalat"):
  61.             return re.findall(r'\d+', span.firstChild.data)[0]
  62.  
  63.  
  64.  
  65. def daterange2URLindex(term, start_date, end_date):
  66.     urlpattern = "http://index.hu/24ora/?s=$TERM$&tol=2010-04-02&ig=2010-04-02"
  67.     cum = 0
  68.     for single_date in daterange(start_date, end_date):
  69.         datestr = single_date.strftime("%Y-%m-%d")
  70.         url = re.sub(r"\d\d\d\d-\d\d-\d\d", datestr, urlpattern)
  71.         url = url.replace("$TERM$", term);
  72.         num = int(processURLindex(url))
  73.         cum = cum + num
  74.         print "\t".join([str(num), str(cum), datestr, url])  
  75.  
  76.  
  77. if __name__ == '__main__':
  78.     if len(sys.argv) == 4:
  79.         start_date = datetime.strptime(sys.argv[2], '%Y-%m-%d')
  80.         end_date = datetime.strptime(sys.argv[3], '%Y-%m-%d')
  81.         daterange2URLindex(sys.argv[1], start_date, end_date)
  82.     else:
  83.         print 'usage:'
  84.         print 'index.hu.py [search term] [from date] [to date] > results.txt'
  85.         print 'the date format is yyyy-mm-dd'
  86.         print 'the output format is TAB delimited and will be the following:'
  87.         print '[count of search results]TAB[count cumulated]TAB[date]TAB[search URL for that date]'
  88.         sys.exit(-1)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement