cmiN

prodcrawl

Feb 9th, 2013
#! /usr/bin/env python


import re
from urllib2 import urlopen, URLError
from urllib import urlencode
from urlparse import urlparse, urljoin
from httplib import HTTPConnection
from hashlib import md5


class DataError(Exception):
    pass


class RequestError(Exception):
    pass


class Crawl(object):

    # requests limit
    RLIM = 200
    # socket timeout in seconds
    TOUT = 3

    def __init__(self, url):
        """Init all objects and fill categories with links."""
        self.url = url
        self.prods = dict()       # products
        self.produrls = list()    # product urls from categories urls
        self.caturls = list()     # categories urls
        self.reqs = 1             # requests
        # untouched lists
        self._caturls = list()
        self._produrls = list()
        # first request
        uobj = urlopen(self.url, timeout=Crawl.TOUT)
        data = uobj.read()
        uobj.close()
        catPat = r"<a class='cat_main' href='(.+?)'"
        for regex in re.finditer(catPat, data):
            link = self._abs_link(regex.group(1))
            self.caturls.append(link)
            self._caturls.append(link)

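    # Sketch of what the category pattern above is meant to match
    # (hypothetical markup, shaped after the regex, not copied from the
    # site):
    #
    #     <a class='cat_main' href='/cat/audio-software.html'>
    #
    # group(1) captures "/cat/audio-software.html", which _abs_link then
    # expands to a full URL.
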
    def _incr_check(self):
        """Limit the number of server requests."""
        self.reqs += 1
        if self.reqs > Crawl.RLIM:
            raise RequestError("limit reached")

    def _abs_link(self, link):
        """Get an absolute URL.

        Based on the HTML sources, links are not relative to the
        current page, but to the main net location (site root).
        """
        if urlparse(link).scheme == "":
            link = urljoin(self.url, link)
        return link

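    # A minimal illustration (hypothetical values): a root-relative link
    # such as "/download/foo.html" scraped from http://1st-download.com
    # becomes "http://1st-download.com/download/foo.html", while a link
    # that already carries a scheme is returned unchanged.
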
    def _fill_produrls(self):
        """Populate products list with links from categories."""
        if len(self.caturls) == 0:
            raise DataError("no more data available")
        cat = self.caturls.pop(0)
        self._incr_check()
        uobj = urlopen(cat, timeout=Crawl.TOUT)
        data = uobj.read()
        uobj.close()
        prodPat = r'<div class="listProdDown"><a href="(.+?)"'
        for regex in re.finditer(prodPat, data):
            link = self._abs_link(regex.group(1))
            self.produrls.append(link)
            self._produrls.append(link)

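    # Each category page is expected to list products as (hypothetical
    # markup inferred from the pattern above):
    #
    #     <div class="listProdDown"><a href="/prog/some-tool.html">
    #
    # so group(1) captures the relative product URL.
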
    def _get_download(self, link):
        """Resolve the redirect and return the final download URL."""
        self._incr_check()
        # read data
        conn = HTTPConnection(urlparse(self.url).netloc, timeout=Crawl.TOUT)
        conn.request("GET", urlparse(link).path)
        resp = conn.getresponse()
        data = resp.read()
        # find the captcha
        expr = r"(\d+) \+ (\d+) ="
        regex = re.search(expr, data)
        if regex is None:
            raise DataError("captcha not found")
        # compute values to send
        added = str(int(regex.group(1)) + int(regex.group(2)))
        hexHash = md5(added).hexdigest()
        # fill in what we need
        params = urlencode({"confirm": "1", "result": added,
                            "res_mask": hexHash, "action": "download"})
        headers = {"Content-type": "application/x-www-form-urlencoded"}
        # submit data
        conn.request("POST", urlparse(link).path, params, headers)
        resp = conn.getresponse()
        download = dict(resp.getheaders())["location"]
        conn.close()
        return download

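    # Worked example of the captcha round-trip (numbers invented, field
    # names taken from the code above): if the page contains "17 + 25 =",
    # the crawler POSTs result="42" together with
    # res_mask=md5("42").hexdigest(), and the final file URL comes back
    # in the "location" header of the redirect response.
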
    def _process_item(self, item):
        """Return a one-entry dict built from a list of details.

        item = [title, redir, platform, size]
        """
        regex = re.search(r"([a-z0-9.]+?)$", item[0])
        ver = regex.group(1)                       # version from title
        title = item[0][:regex.start()].strip()    # title only
        # resolve redirect
        download = self._get_download(item[1])
        # return item as a dict
        return {title: [ver, download, item[2], item[3]]}

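    # For a hypothetical title "Some Tool 1.2.3", the trailing-version
    # regex yields ver = "1.2.3" and title = "Some Tool", so the returned
    # entry is {"Some Tool": ["1.2.3", <download>, <platform>, <size>]}.
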
    def _get_dict_item(self, link):
        """Parse the product page and return its processed details."""
        # save page for parsing
        self._incr_check()
        uobj = urlopen(link, timeout=Crawl.TOUT)
        data = uobj.read()
        uobj.close()
        # find the full title and the redirect page
        tlPat = (r'<span class="bold_text">(.+?)'
                 r'</span><br><br><a href="(.+?)"')
        regex = re.search(tlPat, data)
        title, redir = regex.group(1), regex.group(2)
        redir = self._abs_link(redir)
        # platform
        platPat = r"Platform:</div>.+?>(.+?)<"
        platform = re.search(platPat, data, re.DOTALL).group(1)
        # size
        sizePat = r"File size:</div>.+?>(.+?)<"
        size = re.search(sizePat, data, re.DOTALL).group(1)
        item = [title, redir, platform, size]
        return self._process_item(item)

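    # Sketch of the product-page fragments the three patterns above
    # expect (hypothetical markup inferred from the regexes, not from
    # the site):
    #
    #     <span class="bold_text">Some Tool 1.2.3</span><br><br><a href="..."
    #     Platform:</div> ... >Windows<
    #     File size:</div> ... >5.2 MB<
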
    def next(self):
        """Fetch one more product and update the products dictionary."""
        if len(self.produrls) == 0:
            self._fill_produrls()
        # we've got at least one product link
        # update products dictionary
        prodLink = self.produrls.pop(0)
        self.prods.update(self._get_dict_item(prodLink))

    def get_caturls(self):
        return self._caturls

    def get_produrls(self):
        return self._produrls

    def get_prods(self):
        return self.prods


if __name__ == "__main__":
    prodnr = 50
    crawl = None
    try:
        crawl = Crawl("http://1st-download.com")
        for i in xrange(prodnr):
            crawl.next()
    except RequestError as msg:
        print "RequestError: %s" % msg
    except DataError as msg:
        print "DataError: %s" % msg
    except URLError as msg:
        print "URLError: %s" % msg

    #print "Categories..."
    #print crawl.get_caturls()
    #print "Products..."
    #print crawl.get_produrls()
    #print ""
    # guard against the constructor failing before crawl was ever bound
    if crawl is not None:
        print crawl.get_prods()
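
# Sketch of the final output shape (values invented; the structure
# follows _process_item above): a dict mapping each product title to
# [version, download URL, platform, size], e.g.
#
#     {"Some Tool": ["1.2.3", "http://.../files/some_tool.exe",
#                    "Windows", "5.2 MB"]}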