cmiN

prodcrawl

Feb 9th, 2013
#! /usr/bin/env python


import re
from urllib2 import urlopen, URLError
from urllib import urlencode
from urlparse import urlparse, urljoin
from httplib import HTTPConnection
from hashlib import md5


class DataError(Exception):
    pass


class RequestError(Exception):
    pass


class Crawl(object):

    # requests limit
    RLIM = 200
    # socket timeout in seconds
    TOUT = 3

    def __init__(self, url):
        """Init all objects and fill categories with links."""
        self.url = url
        self.prods = dict()       # products
        self.produrls = list()    # product urls from categories urls
        self.caturls = list()     # categories urls
        self.reqs = 1             # requests
        # untouched lists
        self._caturls = list()
        self._produrls = list()
        # first request
        uobj = urlopen(self.url, timeout=Crawl.TOUT)
        data = uobj.read()
        uobj.close()
        catPat = r"<a class='cat_main' href='(.+?)'"
        for regex in re.finditer(catPat, data):
            link = self._abs_link(regex.group(1))
            self.caturls.append(link)
            self._caturls.append(link)

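    # Sketch of what the category pattern above is meant to match
    # (hypothetical markup, shaped after the regex, not copied from the
    # site):
    #
    #     <a class='cat_main' href='/cat/audio-software.html'>
    #
    # group(1) captures "/cat/audio-software.html", which _abs_link then
    # expands to a full URL.
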
    def _incr_check(self):
        """Limit the number of server requests."""
        self.reqs += 1
        if self.reqs > Crawl.RLIM:
            raise RequestError("limit reached")

    def _abs_link(self, link):
        """Get an absolute URL.

        Based on the HTML sources, links are not relative to the
        current page, but to the main net location (site root).
        """
        if urlparse(link).scheme == "":
            link = urljoin(self.url, link)
        return link

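    # A minimal illustration (hypothetical values): a root-relative link
    # such as "/download/foo.html" scraped from http://1st-download.com
    # becomes "http://1st-download.com/download/foo.html", while a link
    # that already carries a scheme is returned unchanged.
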
    def _fill_produrls(self):
        """Populate products list with links from categories."""
        if len(self.caturls) == 0:
            raise DataError("no more data available")
        cat = self.caturls.pop(0)
        self._incr_check()
        uobj = urlopen(cat, timeout=Crawl.TOUT)
        data = uobj.read()
        uobj.close()
        prodPat = r'<div class="listProdDown"><a href="(.+?)"'
        for regex in re.finditer(prodPat, data):
            link = self._abs_link(regex.group(1))
            self.produrls.append(link)
            self._produrls.append(link)

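    # Each category page is expected to list products as (hypothetical
    # markup inferred from the pattern above):
    #
    #     <div class="listProdDown"><a href="/prog/some-tool.html">
    #
    # so group(1) captures the relative product URL.
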
    def _get_download(self, link):
        """Resolve the redirect and return the final download URL."""
        self._incr_check()
        # read data
        conn = HTTPConnection(urlparse(self.url).netloc, timeout=Crawl.TOUT)
        conn.request("GET", urlparse(link).path)
        resp = conn.getresponse()
        data = resp.read()
        # find the captcha
        expr = r"(\d+) \+ (\d+) ="
        regex = re.search(expr, data)
        if regex is None:
            raise DataError("captcha not found")
        # compute values to send
        added = str(int(regex.group(1)) + int(regex.group(2)))
        hexHash = md5(added).hexdigest()
        # fill in what we need
        params = urlencode({"confirm": "1", "result": added,
                            "res_mask": hexHash, "action": "download"})
        headers = {"Content-type": "application/x-www-form-urlencoded"}
        # submit data
        conn.request("POST", urlparse(link).path, params, headers)
        resp = conn.getresponse()
        download = dict(resp.getheaders())["location"]
        conn.close()
        return download

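    # Worked example of the captcha round-trip (numbers invented, field
    # names taken from the code above): if the page contains "17 + 25 =",
    # the crawler POSTs result="42" together with
    # res_mask=md5("42").hexdigest(), and the final file URL comes back
    # in the "location" header of the redirect response.
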
    def _process_item(self, item):
        """Return a one-entry dict built from a list of details.

        item = [title, redir, platform, size]
        """
        regex = re.search(r"([a-z0-9.]+?)$", item[0])
        ver = regex.group(1)                       # version from title
        title = item[0][:regex.start()].strip()    # title only
        # resolve redirect
        download = self._get_download(item[1])
        # return item as a dict
        return {title: [ver, download, item[2], item[3]]}

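    # For a hypothetical title "Some Tool 1.2.3", the trailing-version
    # regex yields ver = "1.2.3" and title = "Some Tool", so the returned
    # entry is {"Some Tool": ["1.2.3", <download>, <platform>, <size>]}.
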
    def _get_dict_item(self, link):
        """Parse the product page and return its processed details."""
        # save page for parsing
        self._incr_check()
        uobj = urlopen(link, timeout=Crawl.TOUT)
        data = uobj.read()
        uobj.close()
        # find the full title and the redirect page
        tlPat = (r'<span class="bold_text">(.+?)'
                 r'</span><br><br><a href="(.+?)"')
        regex = re.search(tlPat, data)
        title, redir = regex.group(1), regex.group(2)
        redir = self._abs_link(redir)
        # platform
        platPat = r"Platform:</div>.+?>(.+?)<"
        platform = re.search(platPat, data, re.DOTALL).group(1)
        # size
        sizePat = r"File size:</div>.+?>(.+?)<"
        size = re.search(sizePat, data, re.DOTALL).group(1)
        item = [title, redir, platform, size]
        return self._process_item(item)

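    # Sketch of the product-page fragments the three patterns above
    # expect (hypothetical markup inferred from the regexes, not from
    # the site):
    #
    #     <span class="bold_text">Some Tool 1.2.3</span><br><br><a href="..."
    #     Platform:</div> ... >Windows<
    #     File size:</div> ... >5.2 MB<
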
    def next(self):
        """Fetch one more product and update the products dictionary."""
        if len(self.produrls) == 0:
            self._fill_produrls()
        # we've got at least one product link
        # update products dictionary
        prodLink = self.produrls.pop(0)
        self.prods.update(self._get_dict_item(prodLink))

    def get_caturls(self):
        return self._caturls

    def get_produrls(self):
        return self._produrls

    def get_prods(self):
        return self.prods


if __name__ == "__main__":
    prodnr = 50
    crawl = None
    try:
        crawl = Crawl("http://1st-download.com")
        for i in xrange(prodnr):
            crawl.next()
    except RequestError as msg:
        print "RequestError: %s" % msg
    except DataError as msg:
        print "DataError: %s" % msg
    except URLError as msg:
        print "URLError: %s" % msg

    #print "Categories..."
    #print crawl.get_caturls()
    #print "Products..."
    #print crawl.get_produrls()
    #print ""
    # guard against the constructor failing before crawl was ever bound
    if crawl is not None:
        print crawl.get_prods()
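
# Sketch of the final output shape (values invented; the structure
# follows _process_item above): a dict mapping each product title to
# [version, download URL, platform, size], e.g.
#
#     {"Some Tool": ["1.2.3", "http://.../files/some_tool.exe",
#                    "Windows", "5.2 MB"]}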