Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#! Python
"""Log in to VitalSource Bookshelf (jigsaw) and download the "blue box"
example figures and end-of-reading Review question sections from the CFA
curriculum textbooks, saving one HTML file per reading, grouped in one
folder per book.

Interactive script: prompts for credentials, curriculum year and level,
then scrapes over an authenticated requests Session.
"""
import getpass
import json
import os
import re

import requests
from bs4 import BeautifulSoup

# Base URL of the VitalSource content service.
JIGSAW = 'https://jigsaw.vitalsource.com'

# Rewrites relative epub image paths like "../images/foo.png" or
# "images/foo.png" so the saved HTML still shows the images.
IMAGE_SRC_REGEX = re.compile(r'(\.\./)?(images.*)')


def _safe_name(raw_title):
    """Collapse a title to filesystem-safe words (letters, digits, dots)."""
    return ' '.join(re.findall(r'[a-zA-Z0-9.]+', raw_title))


def _absolutize_images(node, isbn):
    """Rewrite every <img src> inside *node* to an absolute jigsaw URL.

    Unlike the original, a src that does not match the expected relative
    pattern is left alone instead of raising AttributeError on .group().
    """
    for img in node.findAll('img'):
        match = IMAGE_SRC_REGEX.search(img['src'])
        if match:
            img['src'] = '%s/books/%d/epub/OEBPS/%s' % (JIGSAW, isbn, match.group(2))


def _save_sections(filename, sections, isbn):
    """Write the given soup nodes (with fixed image links) to *filename*."""
    # Context manager guarantees the file is closed even if a write fails
    # (the original used bare open()/close()).
    with open(filename, 'wb') as out:
        for section in sections:
            _absolutize_images(section, isbn)
            out.write(section.encode('UTF-8'))


username = input('Please type in your E-mail: ')
# getpass keeps the password off the terminal (the original echoed it).
password = getpass.getpass('Please type in your password: ')
year = input('What year of books do you want?: ')
level = input('What Level? (I,II,III): ')

# Credentials posted to the login endpoint (field names match the site form).
payload = {
    'user[email]': username,
    'user[password]': password,
}

# Create the output folder. os.path.join replaces the original Windows-only
# backslash concatenation; exist_ok avoids the exists()/makedirs() race.
starting_location = os.path.abspath(os.curdir)
location = os.path.join(starting_location, 'CFACurriculumProblems')
os.makedirs(location, exist_ok=True)
os.chdir(location)
print('ALL FILES WILL BE SAVED IN: %s' % location)

# Log in and keep the authenticated session open for every download.
with requests.Session() as session:
    # NOTE(review): credentials are sent as query params (params=), mirroring
    # the original script — confirm the endpoint does not expect form data.
    login_response = session.post(JIGSAW + '/login', params=payload)
    library = json.loads(login_response.text)['books']

    # Select books titled like "<year> CFA Level <level> ...".
    title_regex = re.compile(str(year) + r'\sCFA\sLevel\s' + level + r'\s')
    wanted_books = [
        {'title': book['title'], 'isbn': int(book['isbn'])}
        for book in library
        if title_regex.search(book['title'])
    ]

    # Matches table-of-contents entries such as "Reading 12".
    reading_regex = re.compile(r'Reading\s{1,10}\d{1,3}')

    for book in wanted_books:
        toc_url = '%s/books/%d/toc' % (JIGSAW, book['isbn'])
        table_of_contents = json.loads(session.get(toc_url).text)

        # One folder per book. Sanitizing the title fixes the original
        # r'\ ' concatenation, which produced directory names starting
        # with a literal space (and could contain invalid characters).
        book_dir = os.path.join(location, _safe_name(book['title']))
        os.makedirs(book_dir, exist_ok=True)
        os.chdir(book_dir)

        readings = [
            entry for entry in table_of_contents
            if reading_regex.search(entry['title'])
        ]

        for entry in readings:
            page_url = '%s/books/%d/epub%s' % (JIGSAW, book['isbn'], str(entry['path']))
            soup = BeautifulSoup(session.get(page_url).text, "html.parser")

            # Strip <span> wrappers that clutter the extracted HTML.
            for span_tag in soup.findAll('span'):
                span_tag.replace_with('')

            title = _safe_name(entry['title'])
            print('Now Scraping Reading: %s' % title)

            # Blue boxes are marked up as <figure class="example">.
            figures = soup.findAll("figure", {"class": "example"})
            if figures:
                _save_sections('[BlueBoxes] %s.html' % title, figures, book['isbn'])

            # Practice questions and solutions live in <section id="...Review...">.
            review_sections = soup.findAll('section', id=re.compile("Review"))
            if review_sections:
                _save_sections('[Review] %s.html' % title, review_sections, book['isbn'])

print('Done! Enjoy!')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement