Advertisement
Guest User

Untitled

a guest
Jul 2nd, 2017
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 12.86 KB | None | 0 0
  1. #! /usr/bin/env python3
  2. import datetime
  3. import sys
  4. import traceback
  5.  
  6. import lxml.html
  7. import os
  8. import re
  9. import requests
  10.  
  11.  
  12. class Settings:
  13.     def __init__(self):
  14.         self.school = 'ntnu'
  15.         self.base_url = 'https://{}.itslearning.com'.format(self.school)
  16.         self.include_assignment_answers = False
  17.         self.root_dir = os.path.abspath(os.path.join(os.path.curdir, 'Downloaded courses'))
  18.         self.session = requests.Session()
  19.  
  20.     def set_school_and_base_url(self, school: str):
  21.         self.school = school
  22.         self.base_url = 'https://{}.itslearning.com'.format(self.school)
  23.  
  24.  
  25. settings = Settings()
  26. session = requests.Session()
  27.  
  28.  
  29. def main():
  30.     console_settings_init()
  31.     console_login()
  32.     selected_urls = console_select_urls()
  33.     for selected_name, selected_url in selected_urls:
  34.         try:
  35.             download_course_or_project(selected_url)
  36.         except Exception:
  37.             print('failed to download the course/project {}'.format(selected_name))
  38.             cur_dir = os.path.join(settings.root_dir, selected_name)
  39.             file_path = os.path.join(cur_dir, 'errors.txt')
  40.             print('saving error log to {}'.format(file_path))
  41.             os.makedirs(cur_dir, exist_ok=True)
  42.             with open(file_path, 'a') as file:
  43.                 file.write('\r\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\r\n')
  44.                 file.write('start error at {}\r\n'.format(datetime.datetime.now()))
  45.                 file.write(traceback.format_exc())
  46.                 file.write('\r\nend error at {}'.format(datetime.datetime.now()))
  47.                 file.write('\r\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\r\n')
  48.  
  49.  
  50. def console_settings_init():
  51.     if re.match('[hH].*', input('Choose ntnu or hist: ')):
  52.         settings.set_school_and_base_url('hist')
  53.     print('You chose ' + settings.school)
  54.     if re.match('[yYjJ].*', input('Include assignment answers? y/n: ')):
  55.         settings.include_assignment_answers = True
  56.         print('Including assignment answers.')
  57.     else:
  58.         settings.include_assignment_answers = False
  59.         print('Not including assignment answers.')
  60.     new_path = input(
  61.         'Current location is "{}".\r\n'
  62.         'Type a new path to change it or just press enter to keep it:\r\n'.format(settings.root_dir))
  63.     if new_path:
  64.         settings.root_dir = new_path
  65.     print('Path is set to "{}".'.format(settings.root_dir))
  66.  
  67.  
  68. def console_login():
  69.     import getpass
  70.     logged_in = False
  71.     while not logged_in:
  72.         username = input('Brukernavn: ')
  73.         password = getpass.getpass('Passord: ')
  74.         logged_in = attempt_login(username, password)
  75.  
  76.  
  77. def attempt_login(username: str, password: str) -> bool:
  78.     form = get_form_from_page(session.get('https://innsida.ntnu.no/lms-' + settings.school))
  79.     login_url = 'https://idp.feide.no/simplesaml/module.php/feide/login.php' + form.action
  80.     data = get_values_from_form(form)
  81.     data['feidename'] = username.lower()
  82.     data['password'] = password
  83.     confirm_login_page = session.post(login_url, data=data)
  84.     logged_in = confirm_login(confirm_login_page)
  85.     return logged_in
  86.  
  87.  
  88. def get_form_from_page(page: requests.Response) -> lxml.html.FormElement:
  89.     tree = lxml.html.fromstring(page.content)
  90.     form = tree.forms[0]
  91.     if form.xpath('fieldset/select[@name="org"]'):
  92.         page = session.get(page.url + '&org=ntnu.no')
  93.         return get_form_from_page(page)
  94.     return form
  95.  
  96.  
  97. def get_values_from_form(form: lxml.html.FormElement) -> dict:
  98.     return {i.xpath("@name")[0]: i.xpath("@value")[0] for i in form.xpath(".//input[@name]") if i.xpath("@value")}
  99.  
  100.  
  101. def confirm_login(confirm_login_page: requests.Response) -> bool:
  102.     form = get_form_from_page(confirm_login_page)
  103.     try:
  104.         session.post(form.action, get_values_from_form(form))
  105.     except requests.exceptions.MissingSchema:
  106.         return False
  107.     if settings.school == 'hist':
  108.         hist_extra_login(confirm_login_page)
  109.     return True
  110.  
  111.  
  112. def hist_extra_login(confirm_login_page: requests.Response):
  113.     confirm_login_page2 = post_form_from_page(confirm_login_page)
  114.     confirm_login_page3 = post_form_from_page(confirm_login_page2)
  115.     tree = lxml.html.fromstring(confirm_login_page3.content)
  116.     data = {
  117.         '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$federatedLoginButtons$ctl00$ctl00',
  118.         '__EVENTARGUMENT': '',
  119.         '__VIEWSTATE': tree.xpath('//input[@name="__VIEWSTATE"]/@value')[0],
  120.         '__VIEWSTATEGENERATOR': '90059987',
  121.         '__EVENTVALIDATION': tree.xpath('//input[@name="__EVENTVALIDATION"]/@value')[0],
  122.         'ctl00$ContentPlaceHolder1$Username$input': '',
  123.         'ctl00$ContentPlaceHolder1$Password$input': '',
  124.         'ctl00$ContentPlaceHolder1$showNativeLoginValueField': '',
  125.         'ctl00$language_internal$H': '0'
  126.     }
  127.     page = session.post('https://hist.itslearning.com/Index.aspx', data=data)
  128.     confirm_login_page4 = post_form_from_page(page)
  129.     post_form_from_page(confirm_login_page4)
  130.  
  131.  
  132. def post_form_from_page(page: requests.Response) -> requests.Response:
  133.     form = get_form_from_page(page)
  134.     return session.post(form.action, get_values_from_form(form))
  135.  
  136.  
  137. def console_select_urls() -> list:
  138.     choices = get_courses_and_projects()
  139.     names = list(choices)
  140.     print('Found the following favorite courses and projects:')
  141.     for index, course_name in enumerate(names):
  142.         print('{}: {}'.format(index, course_name))
  143.     print('all: all')
  144.     answer = input('List the ones you want to download. Eg. 2 5 6 7 12 3. Or type all\n: ')
  145.     if answer == 'all':
  146.         selected_urls = list(choices.items())
  147.     else:
  148.         selected_urls = [(names[int(i)], choices[names[int(i)]]) for i in answer.split()]
  149.     return selected_urls
  150.  
  151.  
  152. def get_courses_and_projects() -> dict:
  153.     courses = get_courses()
  154.     projects = get_projects()
  155.     return {
  156.         **{
  157.             course_name: settings.base_url + "/main.aspx?CourseID=" + course_id
  158.             for course_name, course_id in courses.items()
  159.         },
  160.         **{
  161.             project_name: settings.base_url + "/main.aspx?ProjectID=" + project_id
  162.             for project_name, project_id in projects.items()
  163.         }
  164.     }
  165.  
  166.  
  167. def get_courses() -> dict:
  168.     return retrieve_topmenu_list(settings.base_url + "/TopMenu/TopMenu/GetCourses")
  169.  
  170.  
  171. def get_projects() -> dict:
  172.     return retrieve_topmenu_list(settings.base_url + "/TopMenu/TopMenu/GetProjects")
  173.  
  174.  
  175. def retrieve_topmenu_list(url: str) -> dict:
  176.     page = session.get(url)
  177.     tree = lxml.html.fromstring(page.content)
  178.     return {
  179.         item.xpath('@data-title')[0]: item.xpath('a/@href')[0].split('=')[-1]
  180.         for item in tree.xpath('//li')
  181.         if item.xpath('@data-title') and item.xpath('a/@href')
  182.     }
  183.  
  184.  
  185. def download_course_or_project(url: str):
  186.     page = session.get(url)
  187.     url = page.url
  188.     tree = lxml.html.fromstring(page.content)
  189.     folder_id = re.search('var contentAreaRootFolderId = \"item\" \+ ([0-9]+);',
  190.                           tree.xpath('//aside/script')[0].text).groups()[0]
  191.     title = tree.xpath('//h1[@class="treemenu-title"]/span/text()')[0]
  192.     directory = os.path.join(settings.root_dir, title)
  193.     download_folder(directory, url, folder_id)
  194.  
  195.  
  196. def download_folder(directory: str, url: str, folder_id: str, excluded_folders: set = set()):
  197.     page = session.get('{}&id=item{}'.format(url, folder_id))
  198.     tree = lxml.html.fromstring(page.content)
  199.     os.makedirs(directory, exist_ok=True)
  200.     for link_element in tree.xpath('//a'):
  201.         link_type, link_tail = link_element.xpath('@href')[0].split('/')[-2:]
  202.         link_url = '{}/{}/{}'.format(settings.base_url, link_type, link_tail)
  203.         link_name = "".join(char if char.isalnum() else '_' for char in link_element.xpath('.//text()')[0].strip())
  204.         if link_type == 'Folder' or link_type == 'ContentArea':
  205.             excluded_folders.add(folder_id)
  206.             new_directory = os.path.join(directory, link_name)
  207.             folder_id = re.search('FolderID=([0-9]+)', link_tail).groups()[0]
  208.             if folder_id not in excluded_folders:
  209.                 download_folder(new_directory, url, folder_id, excluded_folders)
  210.         elif link_type == 'File':
  211.             download_from_file_page(directory, link_url)
  212.         elif link_type == 'essay':
  213.             download_from_essay_page(directory, link_url)
  214.         elif link_type == 'note':
  215.             save_as_html(directory, link_url, link_name)
  216.         elif link_type == 'LearningToolElement':
  217.             save_link(directory, link_url, link_name)
  218.         elif link_type == '':
  219.             pass
  220.         else:
  221.             print('Will not download: {}, (is a {})'.format(os.path.join(directory, link_name), link_type))
  222.  
  223.  
  224. def save_as_html(directory: str, link_url: str, name: str):
  225.     page_to_download = session.get(link_url).content
  226.     with open(os.path.join(directory, name + '.html'), 'wb') as downloaded_file:
  227.         downloaded_file.write(page_to_download)
  228.     print('Saved {} as a html file'.format(os.path.join(directory, name)))
  229.  
  230.  
  231. def save_link(directory: str, link_url: str, name: str):
  232.     tree = get_tree(get_tree(link_url).xpath('//iframe/@src')[0])
  233.     try:
  234.         link = tree.xpath('//section[@class="file-link-link"]/a')[0]
  235.     except IndexError:
  236.         print("could not find download link in page {}, downloading page instead.".format(link_url))
  237.         save_as_html(directory, link_url, name)
  238.         return
  239.     if 'download' in link.keys():
  240.         download_file(directory, link.get('href'))
  241.     else:
  242.         with open(os.path.join(directory, name + '.txt'), 'w') as downloaded_file:
  243.             downloaded_file.write(link.get('href'))
  244.         print('Saved {} as a html file'.format(os.path.join(directory, name)))
  245.  
  246.  
  247. def get_tree(url):
  248.     return lxml.html.fromstring(session.get(url).content)
  249.  
  250.  
  251. def download_from_essay_page(directory: str, link_url: str):
  252.     essay_page = session.get(link_url)
  253.     tree = lxml.html.fromstring(essay_page.content)
  254.     download_urls = tree.xpath(
  255.         '//div[@id="EssayDetailedInformation_FileListWrapper_FileList"]/ul/li/a/@href')
  256.     if settings.include_assignment_answers:
  257.         download_urls += tree.xpath('//div[@id="DF_FileList"]/ul/li/a[@class="ccl-iconlink"]/@href')
  258.     for download_url in download_urls:
  259.         download_file(directory, download_url)
  260.  
  261.  
  262. def download_from_file_page(directory: str, link_url: str):
  263.     try:
  264.         file_page = session.get(link_url)
  265.         download_url = settings.base_url + lxml.html.fromstring(file_page.content).xpath(
  266.             '//a[@class="ccl-button ccl-button-color-green ccl-button-submit"]/@href'
  267.         )[0][2:]
  268.     except Exception:
  269.         print('failed to download the file from {} in {}'.format(link_url, directory))
  270.         file_path = os.path.join(directory, 'errors.txt')
  271.         print('saving error log to {}'.format(file_path))
  272.         with open(file_path, 'a') as file:
  273.             file.write('\r\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\r\n')
  274.             file.write('start error from {} in {} at {}\r\n'.format(link_url, directory, datetime.datetime.now()))
  275.             file.write(traceback.format_exc())
  276.             file.write('\r\nend error from {} in {} at {}'.format(link_url, directory, datetime.datetime.now()))
  277.             file.write('\r\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\r\n')
  278.         return
  279.     download_file(directory, download_url)
  280.  
  281.  
  282. def download_file(directory: str, download_url: str):
  283.     try:
  284.         download = session.get(download_url, stream=True)
  285.         raw_file_name = re.findall('filename="(.+)"', download.headers['content-disposition'])
  286.         if raw_file_name:
  287.             raw_file_name = raw_file_name[0]
  288.         else:
  289.             return
  290.         filename = raw_file_name.encode('iso-8859-1').decode()
  291.         filepath = os.path.join(directory, filename)
  292.         with open(filepath, 'wb') as downloaded_file:
  293.             for chunk in download:
  294.                 downloaded_file.write(chunk)
  295.         print('Downloaded: ', filepath)
  296.     except Exception:
  297.         print('failed to download the file from {} in {}'.format(download_url, directory))
  298.         file_path = os.path.join(directory, 'errors.txt')
  299.         print('saving error log to {}'.format(file_path))
  300.         with open(file_path, 'a') as file:
  301.             file.write('\r\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\r\n')
  302.             file.write('start error from {} in {} at {}\r\n'.format(download_url, directory, datetime.datetime.now()))
  303.             file.write(traceback.format_exc())
  304.             file.write('\r\nend error from {} in {} at {}'.format(download_url, directory, datetime.datetime.now()))
  305.             file.write('\r\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\r\n')
  306.  
  307.  
  308. if __name__ == '__main__':
  309.     if sys.version_info.major == 3 and sys.version_info.minor >= 5:
  310.         main()
  311.     else:
  312.         print('This script is made for python 3.5 (or later)')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement