Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#! Python
"""Log in to VitalSource Bookshelf (jigsaw) and download the "blue box"
example figures and end-of-reading Review question sections from the CFA
curriculum textbooks, saving one HTML file per reading, grouped in one
folder per book.

Interactive script: prompts for credentials, curriculum year and level,
then scrapes over an authenticated requests Session.
"""
import getpass
import json
import os
import re

import requests
from bs4 import BeautifulSoup

# Base URL of the VitalSource content service.
JIGSAW = 'https://jigsaw.vitalsource.com'

# Rewrites relative epub image paths like "../images/foo.png" or
# "images/foo.png" so the saved HTML still shows the images.
IMAGE_SRC_REGEX = re.compile(r'(\.\./)?(images.*)')


def _safe_name(raw_title):
    """Collapse a title to filesystem-safe words (letters, digits, dots)."""
    return ' '.join(re.findall(r'[a-zA-Z0-9.]+', raw_title))


def _absolutize_images(node, isbn):
    """Rewrite every <img src> inside *node* to an absolute jigsaw URL.

    Unlike the original, a src that does not match the expected relative
    pattern is left alone instead of raising AttributeError on .group().
    """
    for img in node.findAll('img'):
        match = IMAGE_SRC_REGEX.search(img['src'])
        if match:
            img['src'] = '%s/books/%d/epub/OEBPS/%s' % (JIGSAW, isbn, match.group(2))


def _save_sections(filename, sections, isbn):
    """Write the given soup nodes (with fixed image links) to *filename*."""
    # Context manager guarantees the file is closed even if a write fails
    # (the original used bare open()/close()).
    with open(filename, 'wb') as out:
        for section in sections:
            _absolutize_images(section, isbn)
            out.write(section.encode('UTF-8'))


username = input('Please type in your E-mail: ')
# getpass keeps the password off the terminal (the original echoed it).
password = getpass.getpass('Please type in your password: ')
year = input('What year of books do you want?: ')
level = input('What Level? (I,II,III): ')

# Credentials posted to the login endpoint (field names match the site form).
payload = {
    'user[email]': username,
    'user[password]': password,
}

# Create the output folder. os.path.join replaces the original Windows-only
# backslash concatenation; exist_ok avoids the exists()/makedirs() race.
starting_location = os.path.abspath(os.curdir)
location = os.path.join(starting_location, 'CFACurriculumProblems')
os.makedirs(location, exist_ok=True)
os.chdir(location)
print('ALL FILES WILL BE SAVED IN: %s' % location)

# Log in and keep the authenticated session open for every download.
with requests.Session() as session:
    # NOTE(review): credentials are sent as query params (params=), mirroring
    # the original script — confirm the endpoint does not expect form data.
    login_response = session.post(JIGSAW + '/login', params=payload)
    library = json.loads(login_response.text)['books']

    # Select books titled like "<year> CFA Level <level> ...".
    title_regex = re.compile(str(year) + r'\sCFA\sLevel\s' + level + r'\s')
    wanted_books = [
        {'title': book['title'], 'isbn': int(book['isbn'])}
        for book in library
        if title_regex.search(book['title'])
    ]

    # Matches table-of-contents entries such as "Reading 12".
    reading_regex = re.compile(r'Reading\s{1,10}\d{1,3}')

    for book in wanted_books:
        toc_url = '%s/books/%d/toc' % (JIGSAW, book['isbn'])
        table_of_contents = json.loads(session.get(toc_url).text)

        # One folder per book. Sanitizing the title fixes the original
        # r'\ ' concatenation, which produced directory names starting
        # with a literal space (and could contain invalid characters).
        book_dir = os.path.join(location, _safe_name(book['title']))
        os.makedirs(book_dir, exist_ok=True)
        os.chdir(book_dir)

        readings = [
            entry for entry in table_of_contents
            if reading_regex.search(entry['title'])
        ]

        for entry in readings:
            page_url = '%s/books/%d/epub%s' % (JIGSAW, book['isbn'], str(entry['path']))
            soup = BeautifulSoup(session.get(page_url).text, "html.parser")

            # Strip <span> wrappers that clutter the extracted HTML.
            for span_tag in soup.findAll('span'):
                span_tag.replace_with('')

            title = _safe_name(entry['title'])
            print('Now Scraping Reading: %s' % title)

            # Blue boxes are marked up as <figure class="example">.
            figures = soup.findAll("figure", {"class": "example"})
            if figures:
                _save_sections('[BlueBoxes] %s.html' % title, figures, book['isbn'])

            # Practice questions and solutions live in <section id="...Review...">.
            review_sections = soup.findAll('section', id=re.compile("Review"))
            if review_sections:
                _save_sections('[Review] %s.html' % title, review_sections, book['isbn'])

print('Done! Enjoy!')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement