Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- #! /usr/bin/env python
- import re
- from urllib2 import urlopen, URLError
- from urllib import urlencode
- from urlparse import urlparse, urljoin
- from httplib import HTTPConnection
- from hashlib import md5
class DataError(Exception):
    """Raised when a page cannot be parsed or no more data is available."""
class RequestError(Exception):
    """Raised when the crawler hits its HTTP request limit."""
class Crawl(object):
    """Crawler for a 1st-download.com style product listing site.

    Walks the category pages found on the start page, collects product
    page URLs from each category, and scrapes every product page for
    title, version, platform, file size and the resolved download link.
    Results accumulate in a dict exposed via get_prods().
    """

    # hard cap on HTTP requests issued per Crawl instance
    RLIM = 200
    # socket timeout in seconds for every request
    TOUT = 3

    def __init__(self, url):
        """Fetch the start page and collect the category links.

        url -- absolute URL of the site's main page.
        Raises URLError on network failure.
        """
        self.url = url
        self.prods = dict()     # {title: [version, download, platform, size]}
        self.produrls = list()  # pending product page URLs
        self.caturls = list()   # pending category page URLs
        self.reqs = 1           # requests issued so far (start page counts)
        # untouched copies of everything seen, for the get_* accessors
        self._caturls = list()
        self._produrls = list()
        # first request: the main page listing the categories
        data = self._read_url(self.url)
        catPat = r"<a class='cat_main' href='(.+?)'"
        for regex in re.finditer(catPat, data):
            link = self._abs_link(regex.group(1))
            self.caturls.append(link)
            self._caturls.append(link)

    def _read_url(self, url):
        """GET *url* and return the body; always closes the handle."""
        uobj = urlopen(url, timeout=Crawl.TOUT)
        try:
            return uobj.read()
        finally:
            # close even when read() raises, so handles never leak
            uobj.close()

    def _incr_check(self):
        """Count one more request; raise RequestError past Crawl.RLIM."""
        self.reqs += 1
        if self.reqs > Crawl.RLIM:
            raise RequestError("limit reached")

    def _abs_link(self, link):
        """Return *link* as an absolute URL.

        Based on HTML sources, every link isn't relative from the
        current page but from the main net location, so scheme-less
        links are joined against the start URL.
        """
        if urlparse(link).scheme == "":
            link = urljoin(self.url, link)
        return link

    def _fill_produrls(self):
        """Populate the product URL queue from the next category page.

        Raises DataError once every category has been consumed.
        """
        if len(self.caturls) == 0:
            raise DataError("no more data available")
        cat = self.caturls.pop(0)
        self._incr_check()
        data = self._read_url(cat)
        prodPat = r'<div class="listProdDown"><a href="(.+?)"'
        for regex in re.finditer(prodPat, data):
            link = self._abs_link(regex.group(1))
            self.produrls.append(link)
            self._produrls.append(link)

    def _get_download(self, link):
        """Resolve the redirect page at *link* to the real download URL.

        The page embeds a trivial "a + b =" captcha; the sum and its md5
        hex digest are POSTed back and the Location response header
        carries the final link.  Raises DataError when the page layout
        is not recognized.
        """
        self._incr_check()
        conn = HTTPConnection(urlparse(self.url).netloc, timeout=Crawl.TOUT)
        try:
            conn.request("GET", urlparse(link).path)
            resp = conn.getresponse()
            data = resp.read()
            # find the arithmetic captcha, e.g. "12 + 34 ="
            regex = re.search(r"(\d+) \+ (\d+) =", data)
            if regex is None:
                raise DataError("captcha not found on redirect page")
            # compute the answer and the hash the form expects
            added = str(int(regex.group(1)) + int(regex.group(2)))
            hexHash = md5(added).hexdigest()
            params = urlencode({"confirm": "1", "result": added,
                                "res_mask": hexHash, "action": "download"})
            headers = {"Content-type": "application/x-www-form-urlencoded"}
            # submit the form and read the redirect target
            conn.request("POST", urlparse(link).path, params, headers)
            resp = conn.getresponse()
            headerMap = dict(resp.getheaders())
            if "location" not in headerMap:
                raise DataError("no redirect location returned")
            return headerMap["location"]
        finally:
            # close the connection even when parsing fails mid-way
            conn.close()

    def _process_item(self, item):
        """Return a one-entry dict built from a list of details.

        item -- [full title, redirect URL, platform, size]; the version
        string is split off the end of the full title.
        Raises DataError when no version can be found in the title.
        """
        regex = re.search(r"([a-z0-9.]+?)$", item[0])
        if regex is None:
            raise DataError("cannot split version from title")
        ver = regex.group(1)                      # version from title
        title = item[0][:regex.start()].strip()   # title only
        # resolve the redirect to the real download link
        download = self._get_download(item[1])
        return {title: [ver, download, item[2], item[3]]}

    def _get_dict_item(self, link):
        """Parse the product page at *link*; return its processed details.

        Raises DataError when the page does not match the expected layout.
        """
        self._incr_check()
        data = self._read_url(link)
        # file full title and the link to the redirect page
        tlPat = r'<span class="bold_text">(.+?)' +\
            r'</span><br><br><a href="(.+?)"'
        regex = re.search(tlPat, data)
        if regex is None:
            raise DataError("product page not recognized")
        title, redir = regex.group(1), regex.group(2)
        redir = self._abs_link(redir)
        # platform and file size live in adjacent markup blocks
        platMatch = re.search(r"Platform:</div>.+?>(.+?)<", data, re.DOTALL)
        sizeMatch = re.search(r"File size:</div>.+?>(.+?)<", data, re.DOTALL)
        if platMatch is None or sizeMatch is None:
            raise DataError("product page not recognized")
        item = [title, redir, platMatch.group(1), sizeMatch.group(1)]
        return self._process_item(item)

    def next(self):
        """Scrape one more product into the results dict.

        Refills the product queue from the next category when empty;
        raises DataError when everything has been crawled.
        """
        if len(self.produrls) == 0:
            self._fill_produrls()
        # we've got at least one product link: update products dictionary
        prodLink = self.produrls.pop(0)
        self.prods.update(self._get_dict_item(prodLink))

    def get_caturls(self):
        """Return every category URL ever seen (untouched list)."""
        return self._caturls

    def get_produrls(self):
        """Return every product URL ever seen (untouched list)."""
        return self._produrls

    def get_prods(self):
        """Return the scraped products dict."""
        return self.prods
- if __name__ == "__main__":
- prodnr = 50
- try:
- crawl = Crawl("http://1st-download.com")
- for i in xrange(prodnr):
- crawl.next()
- except RequestError as msg:
- print "RequestError: %s" % msg
- except DataError as msg:
- print "DataError: %s" % msg
- except URLError as msg:
- print "URLError: %s" % msg
- #print "Categories..."
- #print crawl.get_caturls()
- #print "Products..."
- #print crawl.get_produrls()
- #print ""
- print crawl.get_prods()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement