Guest User

desire2download.py

a guest
Nov 26th, 2015
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 12.69 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # encoding: utf-8
  3. """
  4. desire2download.py
  5. Copyright 2012 Stephen Holiday
  6. Licensed under the Apache License, Version 2.0 (the "License");
  7. you may not use this file except in compliance with the License.
  8. You may obtain a copy of the License at
  9.   http://www.apache.org/licenses/LICENSE-2.0
  10. Unless required by applicable law or agreed to in writing, software
  11. distributed under the License is distributed on an "AS IS" BASIS,
  12. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. See the License for the specific language governing permissions and
  14. limitations under the License.
  15. """
  16.  
  17. import re
  18. import os
  19. import socket
  20. import urllib2
  21. import mechanize
  22. import BeautifulSoup
  23. from errno import EEXIST
  24.  
  25. import sys
  26.  
# Python 2 only: reload(sys) is the usual idiom for making
# sys.setdefaultencoding callable again after interpreter start-up. The call
# below then makes UTF-8 the process-wide default encoding used for implicit
# str <-> unicode conversions.
# NOTE(review): this affects every module in the process, not just this file.
reload(sys)
sys.setdefaultencoding("utf-8")
  29.  
  30.  
  31. class AuthError(Exception):
  32.     """Raised when login credentials fail."""
  33.     pass
  34.  
  35.  
  36. class Desire2Download(object):
  37.     base_url = 'https://learn.uwaterloo.ca/d2l/lp/homepage/home.d2l?ou=6606'
  38.     process_login = 'https://learn.uwaterloo.ca/d2l/lp/auth/login/ProcessLoginActions.d2l'
  39.     cas_login = 'https://cas.uwaterloo.ca/cas/login?service=https%3a%2f%2flearn.uwaterloo.ca%2fd2l%2fcustom%2fcas%3ftarget%3d%252fd2l%252fhome'
  40.     ping_url = 'http://jobminestats.appspot.com/Ping/ag5zfmpvYm1pbmVzdGF0c3IMCxIFUGl4ZWwYuRcM.gif'
  41.  
  42.     def __init__(self, username, password, ignore_re=None, retries=3, skip_existing=True):
  43.         self.username = username
  44.         self.password = password
  45.         self.ignore_re = ignore_re
  46.         self.retries = retries
  47.         self.skip_existing = skip_existing
  48.  
  49.         self.br = mechanize.Browser(factory=mechanize.RobustFactory())
  50.         self.br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko')]
  51.         self.br.set_handle_refresh(True)
  52.         self.br.set_handle_redirect(True)
  53.         self.br.set_handle_referer(True)
  54.  
  55.         self.br.open(self.ping_url).read()
  56.  
  57.     def retry(f):
  58.         """Decorator to retry upon timeout. D2L is slow."""
  59.  
  60.         def retry_it(self, *args, **kwargs):
  61.             attempts = 0
  62.             while attempts < self.retries:
  63.                 try:
  64.                     return f(self, *args, **kwargs)
  65.                 except urllib2.URLError as e:
  66.                     if isinstance(e.reason, socket.timeout):
  67.                         attempts += 1
  68.                         if attempts >= self.retries:
  69.                             print "Timeout, out of retries."
  70.                             raise e
  71.                         print "Timeout, retrying..."
  72.                     else:
  73.                     # Not a timeout, raise exception
  74.                         print "Unknown exception:", e
  75.                         raise e
  76.  
  77.         return retry_it
  78.  
  79.     @retry
  80.     def login(self):
  81.         print 'Logging In...'
  82.         self.br.open(self.cas_login)
  83.         self.br.select_form(nr=0)
  84.         self.br['username'] = self.username
  85.         self.br['password'] = self.password
  86.         response = self.br.submit().read()
  87.         if "Your userid and/or your password are incorrect" in response:
  88.             raise AuthError("Your userid and/or your password are incorrect.")
  89.         self.br.open(self.process_login)
  90.         self.br.open('https://learn.uwaterloo.ca/d2l/lms/news/main.d2l?ou=6606&d2l_change=0')
  91.         print 'Logged In'
  92.  
  93.     def get_course_links(self):
  94.         print 'Finding courses...'
  95.         links = []
  96.         urls = []
  97.         for link in self.br.links():
  98.             link.text = link.text if link.text else ""
  99.             print link.text
  100.             matches = re.match('[A-Z]+ [0-9A-Za-z/\s]{2,45} - [A-Z][a-z]+ 20[0-9]{2}', link.text)
  101.             if matches is not None and link.url not in urls:
  102.                 links.append(link)
  103.                 urls.append(link.url)
  104.         return links
  105.  
  106.     def find_module_content(self, content_link, document_tree, path_to_root, top_modules, depth):
  107.         depth += 1
  108.         for module in top_modules:
  109.             page = self.br.open(content_link.absolute_url + '?itemIdentifier=' + module['data-key']).read()
  110.             soup = BeautifulSoup.BeautifulSoup(page)
  111.             module_content = soup.find('div', 'd2l-page-main-padding')
  112.  
  113.             ## Update path_to_root
  114.             header = module_content.find('h1')
  115.             if header is None:
  116.                 continue
  117.             heading = header.getText()
  118.  
  119.             section_node = new_dir(sanitize_string(heading))
  120.             temp_path = path_to_root
  121.             #crawl down the document tree to the correct location
  122.             for i in range(depth):
  123.                 temp_path = temp_path[-1]['children']
  124.  
  125.             temp_path.append(section_node)
  126.             path_to_module = temp_path[-1]
  127.  
  128.             is_sub_dir = False
  129.             for node in module_content.findAll('li', 'd2l-datalist-item'):
  130.                 dir_header = node.find('div', 'd2l-collapsepane')
  131.  
  132.                 if dir_header is None:
  133.                     #There can be restrictions on files being downloaded, so check first
  134.                     d2l_link = node.find('a', 'd2l-link')
  135.                     if not is_sub_dir and d2l_link:
  136.                         file_node = node_from_link(d2l_link)
  137.                         path_to_module['children'].append(file_node)
  138.                 else:
  139.                     is_sub_dir = True
  140.  
  141.             sub_modules = module.findAll('li', 'd2l-le-TreeAccordionItem')
  142.  
  143.             self.find_module_content(content_link, document_tree, path_to_root, sub_modules, depth)
  144.         return document_tree
  145.  
  146.  
  147.     @retry
  148.     def get_course_documents(self, link, course_name):
  149.         """Produce a tree of documents for the course.
  150.        Args:
  151.            link (str): A url to the course's page on d2l.
  152.            course_name (str): The name of the course.
  153.        Returns:
  154.            A dict representing a tree:
  155.            {
  156.                'type': Either 'file' or 'dir',
  157.                'name': A string.
  158.                'url': Url to the file download (if file).
  159.                'children': A list of children nodes (if a dir).
  160.            }
  161.        """
  162.         self.br.open(link)                                      # Go to course page
  163.         content_link = self.br.links(text='Content').next()     # Get content link
  164.         page = self.br.follow_link(content_link).read()         # Go to content page
  165.         soup = BeautifulSoup.BeautifulSoup(page)
  166.         contents = soup.find('ul', 'd2l-le-TreeAccordion')
  167.  
  168.         ## Initial document tree
  169.         document_tree = new_dir(course_name)
  170.         ## Keeps track of current location in tree
  171.         path_to_root = [document_tree]
  172.  
  173.         all_modules = contents.findAll('li', 'd2l-le-TreeAccordionItem-Root')
  174.         modules = [a for a in all_modules if 'ContentObject.Module' in a['data-key']]
  175.  
  176.         return self.find_module_content(content_link, document_tree, path_to_root, modules, 0)
  177.  
  178.     def download_tree(self, root, _path=None):
  179.         """Downloads the entire file tree
  180.        Args:
  181.            root: A dictionary containing the file tree.
  182.            _path: A list representing the path (relative to current dir) to
  183.                download to. Items in list are strings.
  184.        """
  185.         if not _path:
  186.             _path = []
  187.         if root['type'] == 'dir':
  188.             path = _path[:]
  189.             path.append(root['name'])
  190.             for node in root['children']:
  191.                 self.download_tree(node, path)
  192.         else:
  193.             path = '/'.join(map(lambda x: x.replace('/', '-'), _path))
  194.             self.download_file(root['name'], root['url'], path)
  195.  
  196.     def download_file(self, title, url, path):
  197.         """Downloads a file to the specified directory.
  198.        Args:
  199.            title (str): Name of the file.
  200.            url (str): Address to the direct link.
  201.            path (str): Relative path of file to make.
  202.        """
  203.         try:
  204.             os.makedirs(path)
  205.         except OSError as e:
  206.             if e.errno != EEXIST:
  207.                 raise
  208.  
  209.         #Mechanize pukes trying to open the url sometimes...
  210.         try:
  211.             info = self.br.open(url).info()
  212.             #D2L hides the content type here... so we have to do a little bit more work
  213.             if 'content-disposition' in info:
  214.                 name = info.dict['content-disposition'].split(';')[-1]
  215.                 extension = name[name.rfind("."): -1]
  216.             else:
  217.                 extension = '.' + info.subtype
  218.             filename = title + extension
  219.         except ValueError:
  220.             #maybe better to just return?
  221.             filename = title + ".pdf"
  222.  
  223.         for r in self.ignore_re:
  224.             if r.match(filename) is not None:
  225.                 print 'Skipping %s because it matches ignore regex "%s"' % (filename, r.pattern)
  226.                 return
  227.  
  228.         path_and_filename = '%s/%s' % (path, filename.strip('/'))
  229.         if os.path.isdir(os.path.join(os.getcwd(), path_and_filename)): # Handle empty file names
  230.             print ' X %s is a directory, not a file. Skipping.' % path_and_filename
  231.         elif os.path.isfile(
  232.                 os.path.join(os.getcwd(), path_and_filename)) and self.skip_existing:  # TODO Can we make this smarter?
  233.             print ' - %s (Already Saved)' % path_and_filename
  234.         else:
  235.             try:
  236.                 print ' + %s' % path_and_filename
  237.                 self.br.retrieve(url, path_and_filename, self._progress_bar)
  238.             except KeyboardInterrupt:
  239.                 # delete the file on a keyboard interrupt
  240.                 if os.path.exists(path_and_filename):
  241.                     os.remove(path_and_filename)
  242.                 raise
  243.             except urllib2.HTTPError, e:
  244.                 if e.code == 404:
  245.                     print " X File does not exist: %s" % filename.strip('/')
  246.                 else:
  247.                     print " X HTTP error %s for: %s" % (e.code, filename.strip('/'))
  248.             except Exception:
  249.                 # otherwise raise the error
  250.                 if os.path.exists(path_and_filename):
  251.                     os.remove(path_and_filename)
  252.                 else:
  253.                     raise
  254.  
  255.     def _progress_bar(self, block_num, bs, size):
  256.         """
  257.            Stolen from https://github.com/KartikTalwar/Coursera/blob/master/coursera.py
  258.        """
  259.         if size > 0:
  260.             if size % bs != 0:
  261.                 block_count = size / bs + 1
  262.             else:
  263.                 block_count = size / bs
  264.  
  265.             fraction = block_num * 1.0 / block_count
  266.             width = 50
  267.  
  268.             stars = '*' * int(width * fraction)
  269.             spaces = ' ' * (width - len(stars))
  270.             progress = ' ' * 3 + '%s [%s%s] (%s%%)' % (convert_bytes(size), stars, spaces, int(fraction * 100))
  271.  
  272.             if fraction * 100 < 100:
  273.                 sys.stdout.write(progress)
  274.  
  275.                 if block_num < block_count:
  276.                     sys.stdout.write('\r')
  277.                 else:
  278.                     sys.stdout.write('\n')
  279.             else:
  280.                 sys.stdout.write(' ' * int(width * 1.5) + '\r')
  281.                 sys.stdout.flush()
  282.  
  283.  
  284. def convert_bytes(byte_amt):
  285.     """
  286.        Stolen from http://www.5dollarwhitebox.org/drupal/node/84
  287.    """
  288.     byte_amt = float(byte_amt)
  289.     if byte_amt >= 1099511627776:
  290.         terabytes = byte_amt / 1099511627776
  291.         size = '%.2fT' % terabytes
  292.     elif byte_amt >= 1073741824:
  293.         gigabytes = byte_amt / 1073741824
  294.         size = '%.2fG' % gigabytes
  295.     elif byte_amt >= 1048576:
  296.         megabytes = byte_amt / 1048576
  297.         size = '%.2fM' % megabytes
  298.     elif byte_amt >= 1024:
  299.         kilobytes = byte_amt / 1024
  300.         size = '%.2fK' % kilobytes
  301.     else:
  302.         size = '%.2fb' % byte_amt
  303.     return size
  304.  
  305.  
  306. def sanitize_string(string):
  307.     return "".join([x for x in string if x.isalnum() or x.isspace()])
  308.  
  309.  
  310. def node_from_link(d2l_link):
  311.     name = sanitize_string(d2l_link.getText())
  312.     try:
  313.         section_number = re.search('/content/([0-9]+)', d2l_link['href']).group(1)
  314.         content_number = re.search('/viewContent/([0-9]+)', d2l_link['href']).group(1)
  315.         link_href = 'https://learn.uwaterloo.ca/d2l/le/content/%s/topics/files/download/%s/DirectFileTopicDownload' % (
  316.             section_number, content_number)
  317.         return new_file(name, link_href)
  318.     except AttributeError:
  319.         #The link isn't associated with Learn, so take the href as is
  320.         return new_file(name, d2l_link['href'])
  321.  
  322.  
  323. def new_dir(name):
  324.     node = _new_node('dir', name)
  325.     node['children'] = []
  326.     return node
  327.  
  328.  
  329. def new_file(name, url):
  330.     node = _new_node('file', name)
  331.     node['url'] = url
  332.     return node
  333.  
  334.  
  335. def _new_node(node_type, name):
  336.     return {
  337.         'type': node_type,
  338.         'name': name
  339.     }
Add Comment
Please, Sign In to add comment