Advertisement
Guest User

Untitled

a guest
Dec 25th, 2014
275
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 26.63 KB | None | 0 0
  1. import datetime
  2. import mysql.connector
  3. from bs4 import BeautifulSoup
  4. from selenium import webdriver
  5. import os
  6. import sys
  7. import time
  8. import re
  9. import traceback
  10. import json
  11.  
  12. from selenium.webdriver.support.wait import WebDriverWait
  13. from selenium.webdriver.support import expected_conditions as EC
  14. from selenium.webdriver.common.by import By
  15.  
  16. #http://dev.mysql.com/doc/connector-python/en/connector-python-examples.html
  17.  
  18. def replace_all(my_string,change_this,to_this):
  19. while change_this in my_string:
  20. my_string = my_string.replace(change_this,to_this)
  21. return my_string
  22.  
  23. #Finds the index of the n'th character in a string
  24. def index_of_nth(my_string,find_string,n):
  25. try:
  26. splits = my_string.split('find_string')
  27. index_count = 0
  28. for x in range(0,n - 1):
  29. split = splits[x]
  30. index_count += len(split) + len(find_string)
  31. return index_count
  32. except:
  33. return -1
  34.  
  35.  
  36. def write_to_file(file_name,string_data):
  37. file = open(file_name,'w+')
  38. file.write(string_data)
  39. file.close()
  40.  
  41. main_data_dict = {'schools':[]}
  42.  
  43. def main():
  44. global main_data_dict
  45. #open db connection
  46. #connection = mysql.connector.connect(user='root', database='urlinq_new')
  47. #cursor = connection.cursor(dictionary=True)
  48.  
  49. #query = ('SELECT * FROM event')
  50. #cursor.execute(query)
  51.  
  52. #for event in cursor:
  53. #print event
  54. # pass
  55.  
  56. #cursor.close()
  57.  
  58.  
  59. #Start selenium web driver
  60. driver_name = 'chromedriver.exe'
  61. #Determine if application is a script file or frozen exe
  62. if getattr(sys, 'frozen', False):
  63. application_path = os.path.dirname(sys.executable)
  64. elif __file__:
  65. application_path = os.path.dirname(__file__)
  66. exe_path = os.path.join(application_path, driver_name)
  67.  
  68. #browser = webdriver.Chrome(executable_path=exe_path)
  69.  
  70. browser = webdriver.Firefox()
  71.  
  72. #fp = webdriver.FirefoxProfile()
  73. #fp.set_preference("webdriver.load.strategy", "unstable")
  74.  
  75.  
  76. #browser = webdriver.Firefox(firefox_profile=fp)
  77. #browser = webdriver.PhantomJS()
  78. #browser = webdriver.PhantomJS()
  79.  
  80.  
  81.  
  82.  
  83.  
  84. #browser.implicitly_wait(20) # seconds
  85.  
  86. browser.get('https://admin.portal.nyu.edu/psp/paprod/EMPLOYEE/EMPL/h/?tab')
  87. time.sleep(3)
  88.  
  89. user_id_input = browser.find_element_by_id('userid')
  90. user_id_input.send_keys('your_netid')
  91.  
  92. password_input = browser.find_element_by_id('pwd')
  93. password_input.send_keys('your_password')
  94. password_input.submit()
  95.  
  96. time.sleep(5)
  97.  
  98. #Get the link from the student resources page
  99. student_center_soup = BeautifulSoup(browser.page_source)
  100. student_center_div = student_center_soup.find('div',{'id':'student_center_wsq'})
  101. student_center_a_element = student_center_div.find("a")
  102. student_center_link = student_center_a_element['href']
  103. student_center_link = student_center_link[2:-1]
  104. #print student_center_link
  105. #Take browser to student center_link
  106. browser.get("https://admin.portal.nyu.edu/psp/paprod/EMPLOYEE/EMPL" + student_center_link)
  107. #Let the page load
  108. time.sleep(5)
  109.  
  110.  
  111. browser.get('https://admin.portal.nyu.edu/psp/paprod/EMPLOYEE/CSSS/c/SA_LEARNER_SERVICES.SSS_STUDENT_CENTER.GBL?FolderPath=PORTAL_ROOT_OBJECT.NYU_STUDENT_CTR&IsFolder=false&IgnoreParamTempl=FolderPath%2cIsFolder')
  112.  
  113.  
  114. #Switch the the stupid ass frame
  115. browser.switch_to.default_content()
  116. content_frame = browser.find_element_by_name('TargetContent')
  117. browser.switch_to.frame(content_frame)
  118.  
  119.  
  120. search_for_classes_link = browser.find_element_by_id('DERIVED_SSS_SCR_SSS_LINK_ANCHOR2')
  121. search_for_classes_link.click()
  122.  
  123. time.sleep(4)
  124.  
  125. #Check the "spring 2015" checkbox
  126.  
  127. spring_checkbox = browser.find_element_by_id('NYU_CLS_WRK_NYU_SPRING')
  128. spring_checkbox.click()
  129.  
  130. time.sleep(4)
  131.  
  132. #Loop through the table elements on the page class: SSSGROUPBOXLEFTWBO
  133. #Each table contains the school name with class: SSSGROUPBOXLEFTLABEL
  134. #and the departments for that school in class: SSSAZLINK
  135.  
  136. school_tables = browser.find_elements_by_class_name('SSSGROUPBOXLEFTWBO')
  137. while len(school_tables) <= 0:
  138. school_tables = browser.find_elements_by_class_name('SSSGROUPBOXLEFTWBO')
  139. time.sleep(1)
  140. print "Waiting for school table elements."
  141. print "Length of school_tables: " + str(school_tables)
  142.  
  143.  
  144. for school_index in range(12,len(school_tables)):
  145.  
  146. try:
  147.  
  148.  
  149. #RECLICK THE SPRING 2015 BUTTON
  150. #Check the "spring 2015" checkbox
  151. #print "CHECK BOX STATUS - SCHOOL PAGE"
  152.  
  153.  
  154. spring_checkbox = browser.find_element_by_id('NYU_CLS_WRK_NYU_SPRING')
  155. checkbox_checked = spring_checkbox.is_selected()
  156. #print checkbox_checked
  157.  
  158. if not checkbox_checked:
  159. print "CHECKING THE FUCKING CHECKBOX - SCHPOOL"
  160. spring_checkbox.click()
  161. time.sleep(4)
  162.  
  163.  
  164.  
  165. #Refind the school_tables because after going through a departments courses/classes,
  166. #we then come back to this page where the dom elements in the original school_tables
  167. #are not "attatched" to the current document anymore
  168.  
  169. school_tables = browser.find_elements_by_class_name('SSSGROUPBOXLEFTWBO')
  170. while len(school_tables) <= 0:
  171. school_tables = browser.find_elements_by_class_name('SSSGROUPBOXLEFTWBO')
  172. time.sleep(1)
  173. print "Waiting for school table elements. in school loop"
  174. print "Length of school_tables: " + str(school_tables)
  175.  
  176.  
  177. school_table = school_tables[school_index]
  178.  
  179.  
  180. #Pull the school name
  181.  
  182. school_name = school_table.find_elements_by_class_name('SSSGROUPBOXLEFTLABEL')[0].text
  183. #Remove the first and last space
  184. school_name = school_name[1:-1]
  185. #remove ' - Graduate' or ' - Undergraduate'
  186. school_name.replace(' - Graduate','')
  187. school_name.replace(' - Undergraduate','')
  188. data_dict = {'school_name':school_name,'departments':[]}
  189.  
  190.  
  191.  
  192. departments_holder = school_table.find_element_by_class_name('SSSGROUPBOXLEFT')
  193.  
  194.  
  195. #Get all the department links SSSAZLINK
  196. departments = departments_holder.find_elements_by_tag_name('a')
  197.  
  198.  
  199. while len(departments) <= 0:
  200. departments = school_table.departments_holder.find_elements_by_tag_name('a')
  201. time.sleep(1)
  202. print "Waiting department elements"
  203. print "Length of departments: " + str(departments)
  204. print "school: " + school_name
  205.  
  206. for department_index in range(0,len(departments)):
  207. department_dict = {'courses':[]}
  208. #RECLICK THE SPRING 2015 BUTTON
  209. #Check the "spring 2015" checkbox
  210. #if it is NOT checked
  211. spring_checkbox = browser.find_element_by_id('NYU_CLS_WRK_NYU_SPRING')
  212. #print "CHECK BOX STATUS - IN DEPARTMENTS LOOP"
  213. checkbox_checked = spring_checkbox.is_selected()
  214. #print checkbox_checked
  215.  
  216.  
  217. if not checkbox_checked:
  218. print "CHECKING THE FUCKING CHECKBOX - DEP"
  219. spring_checkbox.click()
  220. time.sleep(4)
  221.  
  222.  
  223. #Repeat all these steps so the elements are
  224. #forsure attatched to the DOM
  225.  
  226. school_tables = browser.find_elements_by_class_name('SSSGROUPBOXLEFTWBO')
  227. while len(school_tables) <= 0:
  228. school_tables = browser.find_elements_by_class_name('SSSGROUPBOXLEFTWBO')
  229. time.sleep(1)
  230. print "Waiting for school table elements. In department loop"
  231. print "Length of school_tables: " + str(school_tables)
  232.  
  233. school_table = school_tables[school_index]
  234.  
  235. departments_holder = school_table.find_element_by_class_name('SSSGROUPBOXLEFT')
  236. departments = departments_holder.find_elements_by_tag_name('a')
  237. department = departments[department_index]
  238.  
  239. department_name = department.text
  240. #department_name looks like this: Ctr for Urban Sci and Progress (CUSP-GX)
  241. #so we need to remove the (text)
  242. department_name = department_name[0:department_name.index('(') - 1]
  243. department_name = replace_all(department_name,'\n',' ')
  244.  
  245. department_dict['department_name'] = department_name
  246. #Go to the courses for this department
  247. department.click()
  248.  
  249. time.sleep(4)
  250. #Get all the courses and loop through
  251.  
  252. courses = browser.find_elements_by_class_name('PSGROUPBOXWBO')
  253. continue_loop = False
  254. while len(courses) <= 0:
  255. courses = browser.find_elements_by_class_name('PSGROUPBOXWBO')
  256. time.sleep(1)
  257. print "Waiting for courses elements"
  258. print "Length of courses: " + str(school_tables)
  259.  
  260. #check if this department simply as no courses
  261. try:
  262. class_count_span_text = browser.find_element_by_id("NYU_CLS_WRK_DESCR100").text
  263. if 'Total Class Count: 0' in class_count_span_text:
  264. print "This department doesnt have any fucking courses"
  265. continue_loop = True
  266. break
  267. except:
  268. print "class count page title not an element"
  269.  
  270. if continue_loop:
  271. clicked = False
  272. while not clicked:
  273. try:
  274. browser.find_element_by_id('NYU_CLS_DERIVED_BACK').click()
  275. clicked = True
  276. print "SUCCESSFULLY CLICKED THE BACK BUTTON"
  277. except:
  278. print "COULD NOT CLICK THE BACK BUTTON. TRYING AGAIN"
  279. print "department_name: " + department_name
  280. time.sleep(1)
  281. #If this department doesnt have any courses, lets just go back to school page
  282. continue
  283.  
  284. if len(courses) > 0:
  285. time.sleep(1)
  286. for course_index in range(0,len(courses)):
  287. try:
  288. course_dict = {'classes':[]}
  289.  
  290. courses = browser.find_elements_by_class_name('PSGROUPBOXWBO')
  291. course = courses[course_index]
  292. try:
  293. #Try to click the "show more description link if it exists
  294. show_more_description_link = course.find_element_by_xpath(".//a[contains(@href,'#')]")
  295. show_more_description_link.click()
  296. time.sleep(4)
  297. except:
  298. pass
  299.  
  300.  
  301.  
  302. #re get the course because selenium is fucking retarded
  303. courses = browser.find_elements_by_class_name('PSGROUPBOXWBO')
  304.  
  305. while len(courses) <= 0:
  306. school_tables = browser.find_elements_by_class_name('SSSGROUPBOXLEFTWBO')
  307. time.sleep(1)
  308. print "Waiting for school table elements."
  309. print "Length of school_tables: " + str(school_tables)
  310.  
  311. course = courses[course_index]
  312.  
  313.  
  314.  
  315.  
  316. span_element = course.find_element_by_tag_name('span')
  317.  
  318.  
  319.  
  320. course_text = span_element.text
  321.  
  322.  
  323. course_name_b_element_text = ''
  324.  
  325. active = False
  326. while not active:
  327. try:
  328. course_name_b_element = span_element.find_element_by_tag_name('b')
  329. course_name_b_element_text = course_name_b_element.text
  330. active = True
  331. except:
  332. time.sleep(1)
  333. print "ERROR GETTING THE COURSE NAME B ELEMENT"
  334. print "TRYING AGAIn"
  335. pass
  336.  
  337.  
  338.  
  339.  
  340.  
  341.  
  342. #Course name is close to the start of course_text
  343. #and is strucutured like this:
  344. #CUSP-GX 6003 Innovation and Entrepreneurship for Urban Technologies | Innovation in Complex Urban Systems\n
  345. #We only want the "Innovation and..." part for the course_name
  346. #CUSP-GX 6003 is the course_tag
  347. second_space_index = 0
  348. #find the second space index. The space inbetween 6003 and Innovation
  349. space_count = 0
  350. for space_index in range(0,len(course_name_b_element_text)):
  351. character = course_text[space_index]
  352. if character == ' ':
  353. space_count += 1
  354. if space_count == 2:
  355. second_space_index = space_index
  356. break
  357.  
  358. #Get the index of the first \n
  359. #It seperats the course_name from the course_description
  360. try:
  361. first_new_line_index = course_text.index('\n')
  362. except:
  363. first_new_line_index = -1
  364.  
  365.  
  366.  
  367. #course_tag = course_text[0:second_space_index]
  368. course_tag = course_name_b_element.text[0:second_space_index]
  369. #course_name = course_text[second_space_index + 1:first_new_line_index]
  370. course_name = replace_all(course_name_b_element_text,'\n',' ')
  371. course_name = course_name.replace(course_tag + ' ','')
  372.  
  373.  
  374. #If "less description for" is not in the string,
  375. #then we want to just read until the end of the file (-1)
  376. end_of_description_index = 0
  377. try:
  378. end_of_description_index = course_text.index('less description for')
  379. except:
  380. end_of_description_index = len(course_text)
  381. course_description = course_text[first_new_line_index + 1: end_of_description_index]
  382. course_description = course_description.replace(course_tag + ' ','')
  383. course_description = replace_all(course_description,'\n',' ')
  384.  
  385.  
  386. course_dict['course_tag'] = course_tag
  387. course_dict['course_name'] = course_name
  388. course_dict['course_description'] = course_description
  389. #print "COURSE NAME"
  390. #print course_name_string
  391. #print "end course name string"
  392.  
  393.  
  394.  
  395.  
  396.  
  397. #Make the drop down appear for classes
  398. course.find_element_by_class_name('PSHYPERLINK').click()
  399. time.sleep(3)
  400.  
  401. #refind the course element after that click
  402. courses = browser.find_elements_by_class_name('PSGROUPBOXWBO')
  403. course = courses[course_index]
  404.  
  405. #classes = course.find_elements_by_class_name('PSLEVEL3SCROLLAREABODY')
  406. #print "COURSE HTML"
  407. #print course.get_attribute('innerHTML')
  408.  
  409. #.// get
  410. classes = course.find_elements_by_xpath(".//div[contains(@id, 'win0divNYU_CLS_DERIVED_HTMLAREA3')]")
  411. while len(classes) <= 0:
  412. classes = course.find_elements_by_xpath(".//div[contains(@id, 'win0divNYU_CLS_DERIVED_HTMLAREA3')]")
  413. time.sleep(1)
  414. print "Waiting for classes to show up for course " + course_name
  415. print "Length of classes: " + str(classes)
  416.  
  417. #Skip the first one
  418. for class_box in classes:
  419. try:
  420. class_dict = {}
  421. class_box_text = class_box.text
  422. class_box_text_parts = class_box_text.split('|')
  423.  
  424.  
  425. class_credits = 0
  426. #Find the section that contains the credits for this class (if any)
  427. for class_box_text_part in class_box_text_parts:
  428. if 'units' in class_box_text_part:
  429. class_credits = class_box_text_part[1:class_box_text_part.index('units') - 1]
  430. break
  431.  
  432. class_number = ''
  433. #find the section that contains class#
  434. for class_box_text_part in class_box_text_parts:
  435. if 'Class#' in class_box_text_part:
  436. class_number = class_box_text_part[class_box_text_part.index('Class#') + 8:len(class_box_text_part) - 1]
  437. break
  438.  
  439.  
  440. #get the last part which usually contains something like this:
  441. # | Component: Seminar\n01/26/2015 - 05/11/2015 Thu 3.30 PM - 6.10 PM at TISC LC2
  442. # with Lukes, Steven\nNotes: Open to sophomores and higher. CAS students register first;
  443. # students from other schools can register on Friday, November 21. Sociology majors can enroll
  444. # under SOC-UA 935.001.
  445. last_class_text_part = class_box_text_parts[-1]
  446. class_location = ''
  447. class_professor_name = ''
  448. #Look for 'at ' with a space to ensure that 'at' is its own word,
  449. #not in another string like 'status'
  450. if 'at ' in last_class_text_part and 'with ' in last_class_text_part:
  451. at_index = last_class_text_part.index('at ')
  452. with_index = last_class_text_part.index('with ')
  453. class_location = last_class_text_part[at_index + 3:with_index - 1]
  454. with_till_end_string = last_class_text_part[with_index: len(last_class_text_part)]
  455. try:
  456. next_new_line_index = with_till_end_string.index('\n')
  457. except:
  458. next_new_line_index = len(with_till_end_string)
  459.  
  460. #Find the next line after this
  461. class_professor_name = with_till_end_string[5:next_new_line_index]
  462. #If there are multiple professors, they will be split with a ;
  463. #We just want the first one for now to simplify this shit
  464. class_professor_name = class_professor_name.split(';')[0]
  465. elif 'with ' in last_class_text_part:
  466. with_index = last_class_text_part.index('with ')
  467. with_till_end_string = last_class_text_part[with_index: len(last_class_text_part)]
  468. try:
  469. next_new_line_index = with_till_end_string.index('\n')
  470. except:
  471. next_new_line_index = len(with_till_end_string)
  472.  
  473. #Find the next line after this
  474. class_professor_name = with_till_end_string[5:next_new_line_index]
  475. #If there are multiple professors, they will be split with a ;
  476. #We just want the first one for now to simplify this shit
  477. class_professor_name = class_professor_name.split(';')[0]
  478.  
  479. days_of_week = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
  480. class_datetime_string = ''
  481. #extract the date/s and time range for this class
  482. for day_of_week in days_of_week:
  483. first_day_index = 0
  484. if day_of_week in last_class_text_part:
  485. first_day_index = last_class_text_part.index(day_of_week)
  486. end_of_class_datetime_index = 0
  487. first_am_or_pm_index = 0
  488. if 'AM ' in last_class_text_part:
  489. first_am_or_pm_index = last_class_text_part.index('AM ')
  490. elif 'PM ' in last_class_text_part:
  491. first_am_or_pm_index = last_class_text_part.index('PM ')
  492.  
  493. #cut last_class_text_part starting at first_am_or_pm_string until the end
  494. sub_string = last_class_text_part[first_am_or_pm_index + 2:-1]
  495. #find the index of the next AM or PM in the sub string
  496. if 'AM ' in sub_string:
  497. end_of_class_datetime_index = sub_string.index('AM ')
  498. elif 'PM ' in sub_string:
  499. end_of_class_datetime_index = sub_string.index('PM ')
  500.  
  501. #Add the first AM or PM index to the end, because we want it to be
  502. #the index of the second AM or PM in last_class_text_part
  503. end_of_class_datetime_index += first_am_or_pm_index
  504. #add two to set the index after the AM or PM
  505. end_of_class_datetime_index += 4
  506.  
  507. class_datetime_string = last_class_text_part[first_day_index:end_of_class_datetime_index]
  508.  
  509.  
  510. #Stop the forloop so we dont skip the first day
  511. break
  512.  
  513.  
  514.  
  515. class_dict['class_name'] = class_box_text
  516. class_dict['class_credits'] = class_credits
  517. class_dict['class_number'] = class_number
  518. class_dict['class_location'] = class_location
  519. class_dict['class_professor_name'] = class_professor_name
  520. class_dict['class_datetime'] = class_datetime_string
  521.  
  522. #print "CLASS CREDITS"
  523. #print class_credits
  524.  
  525. #print 'CLASS NUMBER'
  526. #print class_number
  527.  
  528.  
  529. course_dict['classes'].append(class_dict)
  530. except:
  531.  
  532. print "Error getting class in course " + course_name
  533. print traceback.print_exc()
  534. print '---------------------------------------------'
  535.  
  536. course_dict['classes'].append({'class_name':'ERROR'})
  537.  
  538. department_dict['courses'].append(course_dict)
  539. except:
  540. print "Error grabbing courses from school " + school_name + " and department " + department_name
  541. print traceback.print_exc()
  542. course_dict = {'course_name':'ERROR'}
  543. department_dict['courses'].append(course_dict)
  544.  
  545. data_dict['departments'].append(department_dict)
  546.  
  547.  
  548. print json.dumps(data_dict)
  549. #after we are done extracting data from all courses/classes, go back to the
  550. #school page
  551. clicked = False
  552. while not clicked:
  553. try:
  554. browser.find_element_by_id('NYU_CLS_DERIVED_BACK').click()
  555. clicked = True
  556. print "SUCCESSFULLY CLICKED THE BACK BUTTON"
  557. except:
  558. print "COULD NOT CLICK THE BACK BUTTON. TRYING AGAIN"
  559. print "department_name: " + department_name
  560. time.sleep(1)
  561.  
  562. time.sleep(10)
  563.  
  564.  
  565. main_data_dict['schools'].append(data_dict)
  566. print "Writing main JSON to data.txt"
  567. write_to_file('data.txt',json.dumps(main_data_dict))
  568.  
  569. except:
  570. print 'ERROR LOOPING THROUGH SCHOOL'
  571. print traceback.print_exc()
  572.  
  573.  
  574.  
  575.  
  576. time.sleep(10)
  577.  
  578.  
  579.  
  580. #close selenium browser
  581. browser.close()
  582.  
  583. #connection.close()
  584.  
  585.  
  586.  
  587. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement