Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import datetime
- import mysql.connector
- from bs4 import BeautifulSoup
- from selenium import webdriver
- import os
- import sys
- import time
- import re
- import traceback
- import json
- from selenium.webdriver.support.wait import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.common.by import By
- #http://dev.mysql.com/doc/connector-python/en/connector-python-examples.html
def replace_all(my_string, change_this, to_this):
    """Replace `change_this` with `to_this` until no occurrence is left.

    Unlike a single `str.replace` pass, the replacement is repeated, so
    occurrences created by an earlier pass are replaced too
    (e.g. replace_all("aaaa", "aa", "a") == "a").

    my_string   -- the text to rewrite
    change_this -- the substring to remove
    to_this     -- the substring to put in its place

    Returns the rewritten string.
    """
    # An empty needle is "in" every string, so the loop below would never end.
    if not change_this:
        return my_string
    # If the replacement itself contains the needle, every pass re-creates it;
    # a single pass is the only result that terminates.
    if change_this in to_this:
        return my_string.replace(change_this, to_this)
    while change_this in my_string:
        my_string = my_string.replace(change_this, to_this)
    return my_string
def index_of_nth(my_string, find_string, n):
    """Return the index of the n'th occurrence of `find_string` in `my_string`.

    my_string   -- the text to search
    find_string -- the substring to look for
    n           -- which occurrence to locate, counting from 1

    Occurrences are counted without overlap (the same way `str.split` would
    separate them). Returns -1 when there are fewer than `n` occurrences,
    when `n` is less than 1, when `find_string` is empty, or when the
    arguments are not usable as strings/integers.
    """
    try:
        if n < 1 or not find_string:
            return -1
        found_at = -1
        search_from = 0
        for _ in range(n):
            found_at = my_string.find(find_string, search_from)
            if found_at == -1:
                return -1
            # Continue after this match so matches never overlap.
            search_from = found_at + len(find_string)
        return found_at
    except (AttributeError, TypeError):
        # Keep the historical "never raises, -1 means failure" contract.
        return -1
def write_to_file(file_name, string_data):
    """Overwrite the file at `file_name` with `string_data`.

    file_name   -- path of the file to create or truncate
    string_data -- the text to write

    The file is opened in 'w+' mode, so any previous content is discarded.
    """
    # The context manager closes the handle even when write() raises.
    with open(file_name, 'w+') as output_file:
        output_file.write(string_data)
# Accumulates every scraped school. main() appends one entry per school and
# rewrites data.txt with the JSON dump of this dict after each school.
main_data_dict = {'schools': []}

# Base URL of the portal; the relative student-center link scraped from the
# landing page is appended to it.
_PORTAL_BASE_URL = 'https://admin.portal.nyu.edu/psp/paprod/EMPLOYEE/EMPL'
_STUDENT_CENTER_URL = ('https://admin.portal.nyu.edu/psp/paprod/EMPLOYEE/CSSS/c/'
                       'SA_LEARNER_SERVICES.SSS_STUDENT_CENTER.GBL'
                       '?FolderPath=PORTAL_ROOT_OBJECT.NYU_STUDENT_CTR'
                       '&IsFolder=false&IgnoreParamTempl=FolderPath%2cIsFolder')

# Day abbreviations as they appear in a class's schedule text.
_DAYS_OF_WEEK = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
# A stand-alone "AM"/"PM" marker (not part of a longer word such as "TEAM").
_MERIDIEM_RE = re.compile(r'(?<![A-Za-z])[AP]M(?![A-Za-z])')

# NOTE: every print below uses the single-argument call form, which behaves
# the same under the Python 2 print statement and the Python 3 function.


def _wait_for_elements(find, waiting_message):
    """Poll `find()` once per second until it returns a non-empty list."""
    elements = find()
    while not elements:
        time.sleep(1)
        print(waiting_message)
        elements = find()
    return elements


def _find_school_tables(browser, waiting_message):
    """Return the per-school tables of the class-search page, waiting for them."""
    school_tables = _wait_for_elements(
        lambda: browser.find_elements_by_class_name('SSSGROUPBOXLEFTWBO'),
        waiting_message)
    print("Length of school_tables: " + str(len(school_tables)))
    return school_tables


def _ensure_spring_checked(browser, log_message):
    """Tick the spring-term checkbox again if the page has reset it."""
    spring_checkbox = browser.find_element_by_id('NYU_CLS_WRK_NYU_SPRING')
    if not spring_checkbox.is_selected():
        print(log_message)
        spring_checkbox.click()
        time.sleep(4)


def _click_back(browser, department_name):
    """Click the "back" button, retrying once per second until it works."""
    while True:
        try:
            browser.find_element_by_id('NYU_CLS_DERIVED_BACK').click()
        except Exception:
            # Not a bare except: Ctrl-C must still be able to stop the retry.
            print("COULD NOT CLICK THE BACK BUTTON. TRYING AGAIN")
            print("department_name: " + department_name)
            time.sleep(1)
        else:
            print("SUCCESSFULLY CLICKED THE BACK BUTTON")
            return


def _open_class_search(browser, net_id, password):
    """Log in and navigate to the class-search page with the spring box ticked."""
    browser.get(_PORTAL_BASE_URL + '/h/?tab')
    time.sleep(3)
    browser.find_element_by_id('userid').send_keys(net_id)
    password_input = browser.find_element_by_id('pwd')
    password_input.send_keys(password)
    password_input.submit()
    time.sleep(5)
    # Get the link from the student resources page. The parser is named
    # explicitly so the result does not depend on which parsers are installed.
    student_center_soup = BeautifulSoup(browser.page_source, 'html.parser')
    student_center_div = student_center_soup.find('div', {'id': 'student_center_wsq'})
    # NOTE(review): the first two and the last character of the href are
    # dropped; presumably the href is wrapped in extra characters - confirm.
    student_center_link = student_center_div.find('a')['href'][2:-1]
    browser.get(_PORTAL_BASE_URL + student_center_link)
    time.sleep(5)
    browser.get(_STUDENT_CENTER_URL)
    # The page content lives inside the 'TargetContent' frame.
    browser.switch_to.default_content()
    browser.switch_to.frame(browser.find_element_by_name('TargetContent'))
    browser.find_element_by_id('DERIVED_SSS_SCR_SSS_LINK_ANCHOR2').click()
    time.sleep(4)
    browser.find_element_by_id('NYU_CLS_WRK_NYU_SPRING').click()
    time.sleep(4)


def _parse_course_text(course_text, title_text):
    """Split a course box into (course_tag, course_name, course_description).

    `title_text` is the bold heading, structured like
    "CUSP-GX 6003 Innovation and Entrepreneurship ...": the first two
    space-separated tokens are the tag, the rest is the name. `course_text`
    is the text of the whole box; everything after its first newline, up to
    an optional 'less description for' marker, is the description.
    """
    first_space = title_text.find(' ')
    second_space = title_text.find(' ', first_space + 1) if first_space != -1 else -1
    course_tag = title_text[:second_space] if second_space != -1 else ''

    course_name = replace_all(title_text, '\n', ' ')
    # find() yields -1 when there is no newline, so the slice starts at 0.
    description_start = course_text.find('\n') + 1
    description_end = course_text.find('less description for')
    if description_end == -1:
        description_end = len(course_text)
    course_description = course_text[description_start:description_end]

    # With an empty tag the pattern would be a lone space and the replace
    # would strip every space from the name and the description.
    if course_tag:
        course_name = course_name.replace(course_tag + ' ', '')
        course_description = course_description.replace(course_tag + ' ', '')
    course_description = replace_all(course_description, '\n', ' ')
    return course_tag, course_name, course_description


def _extract_class_datetime(text):
    """Return the "<days> <start> AM/PM - <end> AM/PM" part of `text`, or ''."""
    day_positions = [text.index(day) for day in _DAYS_OF_WEEK if day in text]
    if not day_positions:
        return ''
    # Start at the day that appears first in the text, not first in the week.
    start = min(day_positions)
    marker_ends = [match.end() for match in _MERIDIEM_RE.finditer(text, start)]
    if not marker_ends:
        return ''
    # The second AM/PM marker closes the time range; fall back to the first.
    end = marker_ends[1] if len(marker_ends) > 1 else marker_ends[0]
    return text[start:end]


def _parse_class_text(class_box_text):
    """Turn the '|'-separated text of one class box into a class dict."""
    parts = class_box_text.split('|')

    # NOTE(review): the slices below assume each part is padded with one
    # space on both sides and that the number follows "Class#: " - confirm.
    class_credits = 0
    for part in parts:
        if 'units' in part:
            class_credits = part[1:part.index('units') - 1]
            break
    class_number = ''
    for part in parts:
        if 'Class#' in part:
            class_number = part[part.index('Class#') + 8:len(part) - 1]
            break

    # The last part usually looks like:
    #  Component: Seminar\n01/26/2015 - 05/11/2015 Thu 3.30 PM - 6.10 PM at TISC LC2
    #  with Lukes, Steven\nNotes: ...
    last_part = parts[-1]
    class_location = ''
    class_professor_name = ''
    # 'at ' and 'with ' are matched with a trailing space so they are less
    # likely to hit the inside of another word.
    if 'with ' in last_part:
        with_index = last_part.index('with ')
        if 'at ' in last_part:
            at_index = last_part.index('at ')
            # Only text between 'at ' and 'with ' can be the location.
            if at_index < with_index:
                class_location = last_part[at_index + 3:with_index - 1]
        professor_line = last_part[with_index + 5:].split('\n')[0]
        # Several professors are separated by ';' - keep the first one only.
        class_professor_name = professor_line.split(';')[0]

    return {
        'class_name': class_box_text,
        'class_credits': class_credits,
        'class_number': class_number,
        'class_location': class_location,
        'class_professor_name': class_professor_name,
        'class_datetime': _extract_class_datetime(last_part),
    }


def _find_courses(browser):
    """Return the course boxes currently on the page (possibly empty)."""
    return browser.find_elements_by_class_name('PSGROUPBOXWBO')


def _scrape_course(browser, course_index):
    """Expand the course at `course_index` and return its course dict."""
    course_dict = {'classes': []}
    course = _find_courses(browser)[course_index]
    try:
        # Click the "show more description" link if it exists.
        course.find_element_by_xpath(".//a[contains(@href,'#')]").click()
        time.sleep(4)
    except Exception:
        # Best effort: many courses simply have no such link.
        pass

    # Elements found before a click are detached from the DOM afterwards,
    # so look the course up again.
    courses = _wait_for_elements(lambda: _find_courses(browser),
                                 "Waiting for courses elements")
    print("Length of courses: " + str(len(courses)))
    course = courses[course_index]
    span_element = course.find_element_by_tag_name('span')
    course_text = span_element.text
    while True:
        try:
            title_text = span_element.find_element_by_tag_name('b').text
            break
        except Exception:
            time.sleep(1)
            print("ERROR GETTING THE COURSE NAME B ELEMENT")
            print("TRYING AGAIn")

    course_tag, course_name, course_description = _parse_course_text(
        course_text, title_text)
    course_dict['course_tag'] = course_tag
    course_dict['course_name'] = course_name
    course_dict['course_description'] = course_description

    # Make the drop down with the classes appear, then re-find the course.
    course.find_element_by_class_name('PSHYPERLINK').click()
    time.sleep(3)
    course = _find_courses(browser)[course_index]
    classes = _wait_for_elements(
        lambda: course.find_elements_by_xpath(
            ".//div[contains(@id, 'win0divNYU_CLS_DERIVED_HTMLAREA3')]"),
        "Waiting for classes to show up for course " + course_name)
    print("Length of classes: " + str(len(classes)))

    for class_box in classes:
        try:
            course_dict['classes'].append(_parse_class_text(class_box.text))
        except Exception:
            print("Error getting class in course " + course_name)
            traceback.print_exc()
            print('---------------------------------------------')
            course_dict['classes'].append({'class_name': 'ERROR'})
    return course_dict


def _scrape_department(browser, school_index, department_index, school_name):
    """Open one department and return its dict, or None if it has no courses.

    On return the browser is on the department's course page, unless the
    department was empty, in which case "back" has already been clicked.
    """
    department_dict = {'courses': []}
    _ensure_spring_checked(browser, "CHECKING THE FUCKING CHECKBOX - DEP")
    # Repeat the lookups so the elements are attached to the current DOM.
    school_tables = _find_school_tables(
        browser, "Waiting for school table elements. In department loop")
    departments_holder = school_tables[school_index].find_element_by_class_name(
        'SSSGROUPBOXLEFT')
    department = departments_holder.find_elements_by_tag_name('a')[department_index]

    # The link text looks like "Ctr for Urban Sci and Progress (CUSP-GX)";
    # drop the parenthesised code and the space in front of it.
    department_name = department.text
    paren_index = department_name.find('(')
    if paren_index > 0:
        department_name = department_name[0:paren_index - 1]
    department_name = replace_all(department_name, '\n', ' ')
    department_dict['department_name'] = department_name

    department.click()
    time.sleep(4)

    courses = _find_courses(browser)
    has_no_courses = False
    while not courses:
        courses = _find_courses(browser)
        time.sleep(1)
        print("Waiting for courses elements")
        print("Length of courses: " + str(len(courses)))
        # Check whether this department simply has no courses.
        try:
            class_count_text = browser.find_element_by_id("NYU_CLS_WRK_DESCR100").text
            if 'Total Class Count: 0' in class_count_text:
                print("This department doesnt have any fucking courses")
                has_no_courses = True
                break
        except Exception:
            print("class count page title not an element")
    if has_no_courses:
        _click_back(browser, department_name)
        return None

    time.sleep(1)
    for course_index in range(len(courses)):
        try:
            course_dict = _scrape_course(browser, course_index)
        except Exception:
            print("Error grabbing courses from school " + school_name +
                  " and department " + department_name)
            traceback.print_exc()
            course_dict = {'course_name': 'ERROR'}
        department_dict['courses'].append(course_dict)
    return department_dict


def _scrape_school(browser, school_index):
    """Scrape every department of the school at `school_index`."""
    _ensure_spring_checked(browser, "CHECKING THE FUCKING CHECKBOX - SCHPOOL")
    # Re-find the tables: after visiting a department the earlier elements
    # are no longer attached to the current document.
    school_tables = _find_school_tables(
        browser, "Waiting for school table elements. in school loop")
    school_table = school_tables[school_index]

    school_name = school_table.find_elements_by_class_name('SSSGROUPBOXLEFTLABEL')[0].text
    # NOTE(review): assumes the label is padded with one character on each
    # side; str.strip() would be safer if that is only whitespace - confirm.
    school_name = school_name[1:-1]
    # str.replace returns a new string, so the result has to be kept.
    school_name = school_name.replace(' - Graduate', '')
    school_name = school_name.replace(' - Undergraduate', '')
    data_dict = {'school_name': school_name, 'departments': []}

    departments_holder = school_table.find_element_by_class_name('SSSGROUPBOXLEFT')
    departments = _wait_for_elements(
        lambda: departments_holder.find_elements_by_tag_name('a'),
        "Waiting department elements")
    print("Length of departments: " + str(len(departments)))
    print("school: " + school_name)

    for department_index in range(len(departments)):
        department_dict = _scrape_department(
            browser, school_index, department_index, school_name)
        if department_dict is None:
            # Empty department: already back on the school page.
            continue
        data_dict['departments'].append(department_dict)
        print(json.dumps(data_dict))
        # Done with this department's courses; return to the school page.
        _click_back(browser, department_dict['department_name'])
        time.sleep(10)
    return data_dict


def main(start_school_index=12, net_id='your_netid', password='your_password'):
    """Scrape schools, departments, courses and classes into data.txt.

    start_school_index -- index of the first school table to scrape; earlier
                          tables are skipped (defaults to the historical 12)
    net_id, password   -- portal credentials typed into the login form

    Each successfully scraped school is appended to the module-level
    `main_data_dict`, and data.txt is rewritten with its JSON dump. A school
    that raises is logged and skipped. The browser is always shut down on
    exit, including on errors and Ctrl-C.
    """
    browser = webdriver.Firefox()
    try:
        _open_class_search(browser, net_id, password)
        # Each table holds one school name and the links to its departments.
        school_tables = _find_school_tables(
            browser, "Waiting for school table elements.")
        for school_index in range(start_school_index, len(school_tables)):
            try:
                data_dict = _scrape_school(browser, school_index)
                main_data_dict['schools'].append(data_dict)
                print("Writing main JSON to data.txt")
                write_to_file('data.txt', json.dumps(main_data_dict))
            except Exception:
                print('ERROR LOOPING THROUGH SCHOOL')
                traceback.print_exc()
                time.sleep(10)
    finally:
        # quit() ends the whole driver session, not just the current window.
        browser.quit()
# Only start the scraper when executed as a script, so importing this module
# (e.g. to reuse its helpers) does not launch a browser.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement