Advertisement
Guest User

Untitled

a guest
May 27th, 2018
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.94 KB | None | 0 0
import ast
import json
import queue
import re
import sys
from time import sleep

import pandas as pd
import pymongo
from requests_toolbelt.threaded import pool
from selenium import webdriver
from selenium.common.exceptions import ElementNotVisibleException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# from mongothon import Schema
  17.  
  18. # gets all the course pages and parses all the information on the pages
  19. def get_courses():
  20.     q = queue.Queue()
  21.  
  22.     for course in course_links_dict.keys():
  23.         q.put({'method': 'GET', 'url': course_links_dict[course]})
  24.  
  25.     # call to requests toolbelt to get all the course detail pages
  26.     p = pool.Pool(job_queue=q, num_processes=30)
  27.     p.join_all()
  28.  
  29.     for response in p.responses():
  30.         course_dict = {}
  31.         dfs = pd.read_html(response.content)
  32.  
  33.         # get semester
  34.         semester = re.findall(r'(Fall [0-9]*|Spring [0-9]*|Summer [0-9]*)', response.text)[0]
  35.         course_dict['Semester'] = semester
  36.  
  37.         # parse the main details table
  38.         course_details_columns = ['header', 'Id', 'Subject', 'Number', 'Section', 'Title',
  39.                                   'Dates', 'Days', 'Time', 'Credits', 'Status', 'Instructor',
  40.                                   'Delivery Method']
  41.         for key_num in range(len(course_details_columns)):
  42.             value = str(dfs[4].values[0][key_num]).encode('ascii', 'ignore').decode()
  43.             if value != 'nan':
  44.                 course_dict[course_details_columns[key_num]] = value
  45.  
  46.         # parse the meeting details table
  47.         meeting_details_columns = dfs[5].loc[0].values
  48.         for key_num in range(len(meeting_details_columns)):
  49.             value = str(dfs[5].loc[1].values[key_num]).encode('ascii', 'ignore').decode()
  50.             course_dict[meeting_details_columns[key_num]] = value
  51.  
  52.         # parse the location details table
  53.         for i in [0,1]:
  54.             for j in [0,1]:
  55.                 if type(dfs[6].ix[i,j]) is str:
  56.                     key, value = dfs[6].ix[i,j].split(': ')
  57.                     value = value.strip()
  58.                     course_dict[key] = value
  59.  
  60.         # parse the seat seat availability table
  61.         for item in dfs[7].values[0]:
  62.             key, value = item.split(': ')
  63.             course_dict[key] = value
  64.  
  65.         course_dict['url'] = response.url
  66.  
  67.         print(course_dict)
  68.         print('===========')
  69.         send_to_db(course_dict)
  70.  
  71.  
  72.  
  73. def send_to_db(course_dict):
  74.     pass
  75.  
  76. # iterates through pages to get all the course urls
  77. def get_course_links():
  78.     course_links = driver.find_elements_by_tag_name("a")
  79.  
  80.     for link in course_links:
  81.         url = link.get_attribute("href")
  82.         if url is not None and '/registration/search/detail' in url:
  83.             course_id = re.findall(r'courseid=[0-9]*', url)[0].split('=')[1]
  84.             course_links_dict[course_id] = url
  85.  
  86.     try:
  87.         driver.find_element_by_id("yui-pg0-0-next-link").click()
  88.     except:
  89.         return
  90.     get_course_links()
  91.  
  92.  
  93.  
  94. course_links_dict = {}
  95. try: # try to open the previous gathered course list
  96.     with open('course_links_dict', 'r') as f:
  97.         course_links_dict = eval(f.read())
  98. except:
  99.     pass
  100.  
  101. if course_links_dict == {}: # else build the course list
  102.     driver = webdriver.Chrome()
  103.     driver.get('https://eservices.minnstate.edu/registration/search/basic.html?campusid=304')
  104.     semester_actual_values = driver.find_element_by_name("yrtr").find_elements_by_tag_name('option')
  105.     subject_actual_values = driver.find_element_by_name('subject').find_elements_by_tag_name('option')
  106.     open_value = driver.find_elements_by_css_selector("input[type='radio'][name='openValue']")[2]
  107.  
  108.     # click through the form
  109.     for semester_value_index in range(len(semester_actual_values)):
  110.         for subject_value_index in range(1, len(subject_actual_values)):
  111.             semester_actual_values = driver.find_element_by_name("yrtr").find_elements_by_tag_name('option')
  112.             semester_actual_values[semester_value_index].click()
  113.  
  114.             subject_actual_values = driver.find_element_by_name('subject').find_elements_by_tag_name('option')
  115.             subject_actual_values[subject_value_index].click()
  116.  
  117.             open_value = driver.find_elements_by_css_selector("input[type='radio'][name='openValue']")[2]
  118.             open_value.click()
  119.  
  120.             driver.find_element_by_css_selector("input[type='submit'][title='Search']").click()
  121.             get_course_links()
  122.  
  123.             # get the original form page again
  124.             driver.get('https://eservices.minnstate.edu/registration/search/basic.html?campusid=304 ')
  125.  
  126.  
  127.     # write the course list out to file
  128.     with open('course_links_dict', 'w') as f:
  129.         f.write(str(course_links_dict))
  130.  
  131. get_courses()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement