import sys
import mechanize
from re import findall


class UCrawler(object):
    """Crawls a site's paginated listing and writes the collected links to a sitemap XML file."""

    def __init__(self, url=None, fname=None, debug=False):
        self.__url = url
        self.__file = fname
        self.__debug = debug
        self.__links = []   # unique URLs collected from the crawled pages
        self.__pages = []   # pagination URLs derived from the start URL
    def run(self):
        """Crawl the single start URL, or every https:// URL listed in the input file."""
        try:
            if self.__url is not None:
                self._crawl()
            elif self.__file is not None:
                urls = self._getURLs()
                for url in urls:
                    if not url.startswith('https://'):
                        continue
                    self.__url = url.split()[0]
                    print self.__url
                    self._crawl()
        except KeyboardInterrupt:
            sys.exit(' [!] KeyboardInterrupt')
    def _getURLs(self):
        """Read the list of URLs to crawl from the input file, one per line."""
        try:
            with open(self.__file, 'r') as f:
                urls = f.readlines()
            return urls
        except Exception as e:
            self._showError(e)
    def _crawl(self):
        if self.__url.endswith("/"):
            self.__url = self.__url[:-1]
        try:
            print '\n [*] Crawling {}'.format(self.__url)
            self._getPages()
            for page in self.__pages:
                links = self._getLinks(page)
                for link in links:
                    # Skip query-string and purely numeric (pagination) links,
                    # but keep a record of them in ignored_links.log.
                    regex = r'(%s\/\?.+|/\d{1,5}/)' % self.__url
                    if findall(regex, link.url):
                        with open("ignored_links.log", "a") as f:
                            f.write(link.url + "\n")
                        continue
                    # Drop the PHP session id so the same page is not stored twice.
                    url = link.url.split('?PHPSESSID')[0]
                    if url not in self.__links:
                        if self.__debug:
                            #print ' [TITLE] {}: {}'.format(link.text, url)
                            pass
                        self.__links.append(url)
            print '\n [+] Links: {}'.format(len(self.__links))
            self._save()
            self._clean()
        except Exception as e:
            self._showError(e)
    def _clean(self):
        # Reset state so the next call to _crawl() starts fresh.
        self.__links = []
        self.__pages = []
    def _getLinks(self, url=None):
        """Return the links on `url` (or on the start URL) that point back into the same section."""
        if url:
            print "\n [*] Getting links from {}".format(url)
        else:
            url = self.__url
        try:
            br = mechanize.Browser()
            br.open(url)
            # For paginated pages (".../40", ".../80", ...) match against the base URL.
            if findall(r'\/\d.+', url):
                url = url.rsplit('/', 1)[0]
            return br.links(url_regex=url)
        except Exception as e:
            self._showError(e)
    def _getPages(self):
        """Build the list of pagination URLs; the target site paginates in steps of 40."""
        try:
            pages = []
            links = self._getLinks()
            # Pagination links are the ones whose text is just a page number.
            for link in links:
                if findall(r'^\d{1,5}$', link.text):
                    num = int(link.text)
                    if num not in pages:
                        pages.append(num)
            pages.sort()
            if len(pages) == 0:
                # No pagination found: crawl only the start URL.
                self.__pages.append(self.__url)
                return
            for i in range(pages[-1]):
                num = i * 40
                if num == 0:
                    page = self.__url
                else:
                    page = "{}/{}".format(self.__url, num)
                self.__pages.append(page)
            if self.__debug:
                print " [*] Pages:"
                for page in self.__pages:
                    print "\t-", page
        except Exception as e:
            self._showError(e)
    def _save(self):
        xmlname = self._getXMLName()
        print ' [*] Creating XML file [{}]'.format(xmlname)
        header = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
        http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">"""
        # Note: priority, changefreq and lastmod are hardcoded for every entry.
        content = """
    <url>
        <loc>{}</loc>
        <priority>0.5</priority>
        <changefreq>daily</changefreq>
        <lastmod>2015-05-20T21:48:08+00:00</lastmod>
    </url>"""
        footer = "</urlset>"
        try:
            with open(xmlname, "w") as f:
                f.write(header)
                for link in self.__links:
                    f.write(content.format(link))
                f.write(footer)
        except Exception as e:
            self._showError(e)
    def _getXMLName(self):
        # Name the sitemap after the last path component of the URL, e.g. ".../forum" -> "forum.xml".
        xmlname = self.__url.rsplit('/', 1)[1] + '.xml'
        return xmlname
    def _showError(self, e):
        # Report the failing method, append the details to error.log, then stop.
        caller = sys._getframe().f_back.f_code.co_name
        if self.__debug:
            print ' [!] Error: {0} (function: {1})'.format(e, caller)
        with open("error.log", "a") as f:
            f.write("URL: {}\n".format(self.__url))
            f.write("ERROR: {}\n".format(e))
            f.write("METHOD: {}\n".format(caller))
            f.write("--" * 40 + "\n")
        self._exit()

    def _exit(self):
        sys.exit()
USAGE = '''
Usage: python {} [options]

Help Options:
  -h, --help        Show help options

Debug Options:
  --debug           Debug mode

Application Options:
  -u, --url=URL     Set the URL to crawl
  -f, --file=FILE   Set the file containing the URLs to crawl
'''.format(sys.argv[0])


def usage():
    sys.exit(USAGE)
def parse():
    """Minimal hand-rolled argument parsing for -u/--url, -f/--file and --debug."""
    args = sys.argv
    options = {
        'url': None,
        'fname': None,
        'debug': False,
    }
    if '--debug' in args:
        options['debug'] = True
        args.pop(args.index('--debug'))
    if any((x in args for x in ['-h', '--help'])) or not (2 <= len(args) <= 3):
        usage()
    if len(args) == 2:
        # Long form: --url=... or --file=...
        if args[1].startswith('--url='):
            options['url'] = args[1].split('=', 1)[1]
        elif args[1].startswith('--file='):
            options['fname'] = args[1].split('=', 1)[1]
        else:
            usage()
    if len(args) == 3:
        # Short form: -u URL or -f FILE
        if '-u' == args[1]:
            options['url'] = args[2]
        elif '-f' == args[1]:
            options['fname'] = args[2]
        else:
            usage()
    return options.get('url'), options.get('fname'), options.get('debug')
def main():
    url, fname, debug = parse()
    uc = UCrawler(url=url, fname=fname, debug=debug)
    uc.run()


if __name__ == '__main__':
    main()
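

# ---------------------------------------------------------------------------
# Usage sketch (assumes this file is saved as ucrawler.py and run under
# Python 2 with the mechanize package installed; the URL and file names
# below are placeholders, not taken from the script above).
#
# From another script:
#
#   from ucrawler import UCrawler
#
#   uc = UCrawler(url='https://example.com/forum', debug=True)
#   uc.run()   # writes forum.xml, plus error.log / ignored_links.log as needed
#
# From the command line:
#
#   python ucrawler.py -u https://example.com/forum --debug
#   python ucrawler.py --file=urls.txt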