import sys
import mechanize
from re import findall


class UCrawler(object):
    """Crawls a site's paginated listing and writes the collected links to a sitemap XML file."""

    def __init__(self, url=None, fname=None, debug=False):
        self.__url = url
        self.__file = fname
        self.__debug = debug
        self.__links = []   # unique URLs collected from the crawled pages
        self.__pages = []   # pagination URLs derived from the start URL
    def run(self):
        """Crawl the single start URL, or every https:// URL listed in the input file."""
        try:
            if self.__url is not None:
                self._crawl()
            elif self.__file is not None:
                urls = self._getURLs()
                for url in urls:
                    if not url.startswith('https://'):
                        continue
                    self.__url = url.split()[0]
                    print self.__url
                    self._crawl()
        except KeyboardInterrupt:
            sys.exit(' [!] KeyboardInterrupt')
    def _getURLs(self):
        """Read the list of URLs to crawl from the input file, one per line."""
        try:
            with open(self.__file, 'r') as f:
                urls = f.readlines()
            return urls
        except Exception as e:
            self._showError(e)
    def _crawl(self):
        if self.__url.endswith("/"):
            self.__url = self.__url[:-1]
        try:
            print '\n [*] Crawling {}'.format(self.__url)
            self._getPages()
            for page in self.__pages:
                links = self._getLinks(page)
                for link in links:
                    # Skip query-string and purely numeric (pagination) links,
                    # but keep a record of them in ignored_links.log.
                    regex = r'(%s\/\?.+|/\d{1,5}/)' % self.__url
                    if findall(regex, link.url):
                        with open("ignored_links.log", "a") as f:
                            f.write(link.url + "\n")
                        continue
                    # Drop the PHP session id so the same page is not stored twice.
                    url = link.url.split('?PHPSESSID')[0]
                    if url not in self.__links:
                        if self.__debug:
                            #print ' [TITLE] {}: {}'.format(link.text, url)
                            pass
                        self.__links.append(url)
            print '\n [+] Links: {}'.format(len(self.__links))
            self._save()
            self._clean()
        except Exception as e:
            self._showError(e)
    def _clean(self):
        # Reset state so the next call to _crawl() starts fresh.
        self.__links = []
        self.__pages = []
    def _getLinks(self, url=None):
        """Return the links on `url` (or on the start URL) that point back into the same section."""
        if url:
            print "\n [*] Getting links from {}".format(url)
        else:
            url = self.__url
        try:
            br = mechanize.Browser()
            br.open(url)
            # For paginated pages (".../40", ".../80", ...) match against the base URL.
            if findall(r'\/\d.+', url):
                url = url.rsplit('/', 1)[0]
            return br.links(url_regex=url)
        except Exception as e:
            self._showError(e)
    def _getPages(self):
        """Build the list of pagination URLs; the target site paginates in steps of 40."""
        try:
            pages = []
            links = self._getLinks()
            # Pagination links are the ones whose text is just a page number.
            for link in links:
                if findall(r'^\d{1,5}$', link.text):
                    num = int(link.text)
                    if num not in pages:
                        pages.append(num)
            pages.sort()
            if len(pages) == 0:
                # No pagination found: crawl only the start URL.
                self.__pages.append(self.__url)
                return
            for i in range(pages[-1]):
                num = i * 40
                if num == 0:
                    page = self.__url
                else:
                    page = "{}/{}".format(self.__url, num)
                self.__pages.append(page)
            if self.__debug:
                print " [*] Pages:"
                for page in self.__pages:
                    print "\t-", page
        except Exception as e:
            self._showError(e)
    def _save(self):
        xmlname = self._getXMLName()
        print ' [*] Creating XML file [{}]'.format(xmlname)
        header = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
        http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">"""
        # Note: priority, changefreq and lastmod are hardcoded for every entry.
        content = """
    <url>
        <loc>{}</loc>
        <priority>0.5</priority>
        <changefreq>daily</changefreq>
        <lastmod>2015-05-20T21:48:08+00:00</lastmod>
    </url>"""
        footer = "</urlset>"
        try:
            with open(xmlname, "w") as f:
                f.write(header)
                for link in self.__links:
                    f.write(content.format(link))
                f.write(footer)
        except Exception as e:
            self._showError(e)
    def _getXMLName(self):
        # Name the sitemap after the last path component of the URL, e.g. ".../forum" -> "forum.xml".
        xmlname = self.__url.rsplit('/', 1)[1] + '.xml'
        return xmlname
    def _showError(self, e):
        # Report the failing method, append the details to error.log, then stop.
        caller = sys._getframe().f_back.f_code.co_name
        if self.__debug:
            print ' [!] Error: {0} (function: {1})'.format(e, caller)
        with open("error.log", "a") as f:
            f.write("URL: {}\n".format(self.__url))
            f.write("ERROR: {}\n".format(e))
            f.write("METHOD: {}\n".format(caller))
            f.write("--" * 40 + "\n")
        self._exit()

    def _exit(self):
        sys.exit()
USAGE = '''
Usage: python {} [options]

Help Options:
  -h, --help        Show help options

Debug Options:
  --debug           Debug mode

Application Options:
  -u, --url=URL     Set the URL to crawl
  -f, --file=FILE   Set the file containing the URLs to crawl
'''.format(sys.argv[0])


def usage():
    sys.exit(USAGE)
def parse():
    """Minimal hand-rolled argument parsing for -u/--url, -f/--file and --debug."""
    args = sys.argv
    options = {
        'url': None,
        'fname': None,
        'debug': False,
    }
    if '--debug' in args:
        options['debug'] = True
        args.pop(args.index('--debug'))
    if any((x in args for x in ['-h', '--help'])) or not (2 <= len(args) <= 3):
        usage()
    if len(args) == 2:
        # Long form: --url=... or --file=...
        if args[1].startswith('--url='):
            options['url'] = args[1].split('=', 1)[1]
        elif args[1].startswith('--file='):
            options['fname'] = args[1].split('=', 1)[1]
        else:
            usage()
    if len(args) == 3:
        # Short form: -u URL or -f FILE
        if '-u' == args[1]:
            options['url'] = args[2]
        elif '-f' == args[1]:
            options['fname'] = args[2]
        else:
            usage()
    return options.get('url'), options.get('fname'), options.get('debug')
def main():
    url, fname, debug = parse()
    uc = UCrawler(url=url, fname=fname, debug=debug)
    uc.run()


if __name__ == '__main__':
    main()
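

# ---------------------------------------------------------------------------
# Usage sketch (assumes this file is saved as ucrawler.py and run under
# Python 2 with the mechanize package installed; the URL and file names
# below are placeholders, not taken from the script above).
#
# From another script:
#
#   from ucrawler import UCrawler
#
#   uc = UCrawler(url='https://example.com/forum', debug=True)
#   uc.run()   # writes forum.xml, plus error.log / ignored_links.log as needed
#
# From the command line:
#
#   python ucrawler.py -u https://example.com/forum --debug
#   python ucrawler.py --file=urls.txt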