Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
import os

try:  # Python 2
    import urllib2
    from urlparse import urljoin, urlparse
except ImportError:  # Python 3: same functionality, relocated modules
    import urllib.request as urllib2
    from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
class Downloader(object):
    """Download one web page plus the scripts and stylesheets it references.

    The page itself is saved in the current working directory; external
    scripts go to ``JS_PATH`` and stylesheets to ``CSS_PATH``. ``IMG_PATH``
    is created as well, but no images are downloaded.

    Files are named after the last segment of their URL path, so two assets
    sharing a base name overwrite each other.
    """

    CSS_PATH = 'css'
    JS_PATH = 'js'
    IMG_PATH = 'img'

    # Fallback file names for URLs whose path has no usable last segment
    # (for example a page URL that ends in "/").
    DEFAULT_PAGE_NAME = 'index.html'
    DEFAULT_JS_NAME = 'script.js'
    DEFAULT_CSS_NAME = 'style.css'

    # Seconds to wait on a blocking network operation before giving up, so a
    # stalled server cannot hang the download forever.
    TIMEOUT = 30

    def __init__(self, url):
        """Remember the page to download; no network access happens here.

        Args:
            url: Absolute URL of the page. Relative references found in the
                page are resolved against it, so a directory-style URL
                should end with "/".
        """
        self.url = url
        self.content = None  # Raw bytes of the page, set by run().
        self.soup = None  # Parsed page, set by run().

    def run(self):
        """Fetch the page, parse it, and save the page and its assets.

        Raises:
            Whatever ``urlopen`` or ``open`` raise on a failed download or
            write; the first failure aborts the run.
        """
        # Fetch once and reuse the bytes for both parsing and saving.
        self.content = self._get_page(self.url)
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        self.soup = BeautifulSoup(self.content, 'html.parser')
        self._create_directories()
        self._get_html_files()
        self._get_js_files()
        self._get_css_files()

    def _create_directories(self):
        """Create the asset output directories that do not exist yet."""
        for directory in (self.CSS_PATH, self.JS_PATH, self.IMG_PATH):
            if not os.path.exists(directory):
                os.makedirs(directory)

    def _get_html_files(self):
        """Save the HTML of the main page (the step ``run`` expects)."""
        self._get_current_page()

    def _get_current_page(self):
        """Write the page's HTML into the current working directory."""
        if self.content is None:
            self.content = self._get_page(self.url)
        path = self._filename_from_url(self.url, self.DEFAULT_PAGE_NAME)
        self._save_file(path, self.content)

    def _search_local_urls(self):
        """Return the absolute URLs of links that stay under ``self.url``.

        The URLs are only collected, in document order and without
        duplicates. Nothing is downloaded: crawling them would need a
        visited-set to avoid following link cycles forever.
        """
        local_urls = []
        seen = set()
        for a_tag in self._get_a_tags():
            href = a_tag.get('href')
            if not href:
                continue  # Anchor without a target.
            url = self._absolute_url(href)
            if url not in seen and self._is_local_url(url):
                seen.add(url)
                local_urls.append(url)
        return local_urls

    def _get_js_files(self):
        """Download every external script into ``JS_PATH``."""
        self._download_assets(
            self._get_js_tags(), 'src', self.JS_PATH, self.DEFAULT_JS_NAME)

    def _get_css_files(self):
        """Download every linked stylesheet into ``CSS_PATH``."""
        self._download_assets(
            self._get_css_tags(), 'href', self.CSS_PATH, self.DEFAULT_CSS_NAME)

    def _download_assets(self, tags, attribute, directory, default_name):
        """Fetch the URL held in ``attribute`` of each tag and save it.

        Args:
            tags: Parsed tags to process.
            attribute: Name of the tag attribute that holds the asset URL.
            directory: Existing directory the files are written into.
            default_name: File name used when the URL provides none.
        """
        for tag in tags:
            reference = tag.get(attribute)
            if not reference:
                continue  # e.g. an inline <script> with no src.
            asset_url = self._absolute_url(reference)
            filename = self._filename_from_url(asset_url, default_name)
            content = self._get_page(asset_url)
            self._save_file(os.path.join(directory, filename), content)

    def _absolute_url(self, url):
        """Resolve ``url`` against the page URL.

        Absolute URLs come back unchanged. Relative ("a.js"), parent-relative
        ("../a.js"), root-relative ("/a.js") and protocol-relative
        ("//host/a.js") references are resolved the way a browser would.
        """
        return urljoin(self.url, url.strip())

    def _filename_from_url(self, url, default_name):
        """Return a local file name for ``url``.

        The name is the last segment of the URL *path*, so query strings and
        fragments are dropped. ``default_name`` is used when that segment is
        empty or is a directory reference.
        """
        name = urlparse(url).path.rsplit('/', 1)[-1]
        # The name comes from a remote page: never let it carry a path
        # separator or point at a directory.
        name = name.replace('\\', '_')
        if name in ('', '.', '..'):
            return default_name
        return name

    def _get_a_tags(self):
        """Return all ``<a>`` tags of the parsed page."""
        return self.soup.find_all('a')

    def _get_css_tags(self):
        """Return the ``<link>`` tags that reference a stylesheet.

        Other link types (icons, canonical URLs, feeds, ...) are skipped so
        they are not saved as CSS.
        """
        stylesheet_tags = []
        for link_tag in self.soup.find_all('link', href=True):
            rel = link_tag.get('rel') or []
            # Depending on the parser, "rel" is a list of tokens or a string.
            if not isinstance(rel, (list, tuple)):
                rel = rel.split()
            if any(token.lower() == 'stylesheet' for token in rel):
                stylesheet_tags.append(link_tag)
        return stylesheet_tags

    def _get_js_tags(self):
        """Return the ``<script>`` tags that load an external file."""
        return self.soup.find_all('script', src=True)

    def _get_page(self, url):
        """Return the raw body served at ``url``.

        Raises:
            Whatever ``urlopen`` raises when the request fails or times out.
        """
        response = urllib2.urlopen(url, timeout=self.TIMEOUT)
        try:
            return response.read()
        finally:
            # Close explicitly: Python 2 responses are not context managers.
            response.close()

    def _is_local_url(self, url):
        """Return True when ``url`` starts with the page URL."""
        # A plain prefix test: the page URL must not be treated as a regex,
        # where "." and "?" would act as wildcards.
        return url.startswith(self.url)

    def _save_file(self, path, content):
        """Write ``content`` to ``path`` and return the closed file object.

        Args:
            path: Destination file; its directory must already exist.
            content: Raw bytes as returned by ``_get_page``.
        """
        # Binary mode keeps downloads byte-for-byte intact on every platform.
        with open(path, 'wb') as f:
            f.write(content)
        return f
if __name__ == '__main__':
    # Download only when executed as a script, so that importing this module
    # does not trigger network access or write files.
    downloader = Downloader('http://getbootstrap.com/examples/navbar-static-top/')
    downloader.run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement