Advertisement
Guest User

Untitled

a guest
Oct 31st, 2014
185
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.98 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. from bs4 import BeautifulSoup
  4. import urllib2
  5. import os
  6.  
  7. class Downloader(object):
  8.     CSS_PATH = 'css'
  9.     JS_PATH  = 'js'
  10.     IMG_PATH = 'img'
  11.  
  12.     def __init__(self, url):
  13.         self.url     = url # Valid url "/"
  14.         self.content = None
  15.         self.soup    = None
  16.  
  17.     def run(self):
  18.         self.soup = BeautifulSoup(self._get_page(self.url))
  19.        
  20.         self._create_directories()
  21.         self._get_html_files()
  22.         self._get_js_files()
  23.         self._get_css_files()
  24.  
  25.     def _create_directories(self):
  26.         if not os.path.exists(self.CSS_PATH):
  27.             os.makedirs(self.CSS_PATH)
  28.         if not os.path.exists(self.JS_PATH):
  29.             os.makedirs(self.JS_PATH)
  30.         if not os.path.exists(self.IMG_PATH):
  31.             os.makedirs(self.IMG_PATH)
  32.    
  33.     def _get_current_page(self):
  34.         content  = self._get_page(self.url)
  35.         self._save_file(path, content)
  36.    
  37.     def _search_local_urls(self):
  38.         urls = self._get_a_tags()
  39.        
  40.         for url in url:
  41.             if not url.startswith('http://') and not url.startswith('https://')
  42.                 url = self.url + url
  43.             if self._is_local_url(url):
  44.                 downloader = new Downloader(url)
  45.  
  46.     def _get_js_files(self):
  47.         js_tags = self._get_js_tags()
  48.  
  49.         for js_tag in js_tags:
  50.             js_url = js_tag.get('src')
  51.            
  52.             if not js_url.startswith('http://') and not js_url.startswith('https://'):
  53.                 js_url = self.url + js_url
  54.            
  55.             path      = self.JS_PATH+'/'
  56.             filename  = os.path.split(js_url)[1]
  57.             path     += filename
  58.             content   = self._get_page(js_url)
  59.  
  60.             self._save_file(path, content)
  61.  
  62.     def _get_css_files(self):
  63.         css_tags = self._get_css_tags()
  64.  
  65.         for css_tag in css_tags:
  66.             css_url = css_tag.get('href')
  67.  
  68.             if not css_url.startswith('http://') and not css_url.startswith('https://'):
  69.                 css_url = self.url + css_url
  70.  
  71.             path      = self.CSS_PATH+'/'
  72.             filename  = os.path.split(css_url)[1]
  73.             path     += filename
  74.             content   = self._get_page(css_url)
  75.  
  76.             self._save_file(path, content)
  77.    
  78.     def _get_a_tags(self):
  79.        a_tags = self.soup.find_all('a')
  80.        return a_tags
  81.    
  82.     def _get_css_tags(self):
  83.         css_tags = self.soup.find_all('link')
  84.         return css_tags
  85.  
  86.     def _get_js_tags(self):
  87.         js_tags = self.soup.find_all('script')
  88.         return js_tags
  89.  
  90.     def _get_page(self, url):
  91.         response = urllib2.urlopen(url)
  92.         html     = response.read()
  93.         return html
  94.  
  95.     def _is_local_url(self, url):
  96.         pattern = self.url
  97.         r = re.compile(pattern)
  98.        
  99.         return r.match(url)
  100.  
  101.     def _save_file(self, path, content):
  102.         f = open(path, 'w+')
  103.         f.write(content)
  104.         f.close()
  105.         return f
  106.        
  107. downloader = Downloader('http://getbootstrap.com/examples/navbar-static-top/')
  108. downloader.run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement