#!/usr/bin/env python3
# Goes to the VitalSource Bookshelf and downloads the blue-box questions from the CFA textbooks

import requests, json, re, os
from bs4 import BeautifulSoup

username = input('Please type in your E-mail: ')
password = input('Please type in your password: ')
Year = input('What year of books do you want?: ')
Level = input('What Level? (I,II,III): ')
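# Note: the library filter below builds a regex from these two answers, so they should
# look like Year = '2017' and Level = 'II' to match titles that start with '2017 CFA Level II '.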

# Provide username and password
payload = {
    'user[email]': username,
    'user[password]': password
}
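# The bracketed key names are the form fields the Bookshelf login appears to expect;
# they are posted unchanged to the /login endpoint below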

# Creating a folder for the documents
startingLocation = os.path.abspath(os.curdir)
location = os.path.join(startingLocation, 'CFACurriculumProblems')
if not os.path.exists(location):
    os.makedirs(location)
os.chdir(location)
print('ALL FILES WILL BE SAVED IN: %s' % location)
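# (Each matching book later gets its own subfolder created under this location.)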

# Logging in and keeping a session open
with requests.Session() as s:
    p = s.post('https://jigsaw.vitalsource.com/login', params=payload)
    # The login response is expected to be JSON; the user's library sits under the 'books' key
    Library = p.text
    parsed_Library = json.loads(Library)['books']
    my_regex = re.compile(str(Year) + r'\sCFA\sLevel\s' + Level + r'\s')
    isbnList = []
    # Grabbing all of the books to read
    for book in parsed_Library:
        title = book['title']
        isbn = book['isbn']
        if my_regex.search(title):
            # Keep the ISBN as a string so any leading zeros survive the URL building below
            isbnList.append({'title': title, 'isbn': isbn})
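    # isbnList now holds one {'title', 'isbn'} entry per curriculum volume that matched the year/level filter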
    # Going to each book's table of contents first
    for book in isbnList:
        bookUrl = 'https://jigsaw.vitalsource.com/books/' + str(book['isbn']) + '/toc'
        tableOfContents = s.get(bookUrl).text
        parsed_tableOfContents = json.loads(tableOfContents)
        readingList = []
        readingRegex = re.compile(r'Reading\s{1,10}\d{1,3}')
        newLocation = os.path.join(startingLocation, 'CFACurriculumProblems', book['title'])
        if not os.path.exists(newLocation):
            os.makedirs(newLocation)
        os.chdir(newLocation)
        # Grabbing the links to all of the readings
        for reading in parsed_tableOfContents:
            if readingRegex.search(reading['title']):
                readingList.append(reading)
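        # Only TOC entries whose titles look like 'Reading <number>' are kept; front matter and other sections are skipped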
        for subReading in readingList:
            newLink = subReading['path']
            newURL = 'https://jigsaw.vitalsource.com/books/' + str(book['isbn']) + '/epub' + str(newLink)
            reading = s.get(newURL).text
            soupReading = BeautifulSoup(reading, "html.parser")
            for span_tag in soupReading.findAll('span'):
                span_tag.replace_with('')
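            # Note: this removes every <span> element *and its text* from the page,
            # so anything the book wraps in <span> tags will not appear in the saved HTML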
            # Sanitise the reading title so it can be used in file names
            title = subReading['title']
            title = " ".join(re.findall("[a-zA-Z0-9.]+", title))
            print('Now Scraping Reading: %s' % title)

            # === MODIFICATIONS: Creating dump file for analyzing the page source code ===
            # dumpingFile = open('dump of %s.html' % title, 'wb')
            # dumpingFile.write(soupReading.encode('UTF-8'))
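            # (Uncomment the two lines above to dump the full page HTML for a reading when debugging.)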

            # Grabbing all of the blue boxes, which are denoted by "figure" in the HTML
            # Trying to also save them off as Word documents, with both text and tables
            # (see the conversion sketch after the script)
            figures = soupReading.findAll("figure", {"class": "example"})
            if len(figures) >= 1:
                readingFile = open('[BlueBoxes] %s.html' % title, 'wb')
                for figure in figures:
                    for image_tag in figure.findAll('img'):
                        # Rewrite relative 'images/...' paths to absolute jigsaw URLs;
                        # the images sit behind the login, so they will likely only render
                        # in a browser that still has an authenticated Bookshelf session
                        imageTagRegex = re.compile(r'(\.\.\/)?(images.*)')
                        trueImageTag = imageTagRegex.search(image_tag['src']).group(2)
                        image_tag['src'] = 'https://jigsaw.vitalsource.com/books/' + str(book['isbn']) + '/epub/OEBPS/' + trueImageTag
                    readingFile.write(figure.encode('UTF-8'))
                readingFile.close()
            else:
                # No blue boxes in this reading: 'continue' skips straight to the next reading,
                # so the practice-question block below is skipped as well
                continue

            # This part basically mimics the original code.
            # === MODIFICATIONS: Finding practice questions and solutions ===
            practiceQuestions = soupReading.findAll('section', id=re.compile("Review"))
            if len(practiceQuestions) >= 1:
                readingFile = open('[Review] %s.html' % title, 'wb')
                for section in practiceQuestions:
                    for image_tag in section.findAll('img'):
                        imageTagRegex = re.compile(r'(\.\.\/)?(images.*)')
                        trueImageTag = imageTagRegex.search(image_tag['src']).group(2)
                        image_tag['src'] = 'https://jigsaw.vitalsource.com/books/' + str(book['isbn']) + '/epub/OEBPS/' + trueImageTag
                    readingFile.write(section.encode('UTF-8'))
                readingFile.close()
            else:
                continue
    print('Done! Enjoy!')
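The comments above mention wanting the blue boxes as Word documents, but the script itself only writes HTML files. One way to get .docx output is to post-process those files with pandoc; the sketch below is not part of the original script and assumes the pandoc binary and the pypandoc package are installed, and that it is run from inside one of the per-book output folders. Images that still point at the jigsaw URLs may not be embedded unless they are downloaded first.

# Optional post-processing: convert every saved HTML file in the current folder to .docx.
# Requires: pip install pypandoc, plus a pandoc install on the PATH.
import os
import pypandoc

for name in os.listdir('.'):
    if name.endswith('.html'):
        # pandoc should carry over both the text and the tables from the saved blue boxes / review sections
        pypandoc.convert_file(name, 'docx', outputfile=name[:-5] + '.docx')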