Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import ast
import json
import queue
import re
import sys
from time import sleep

import pandas as pd
import pymongo
from requests_toolbelt.threaded import pool
from selenium import webdriver
from selenium.common.exceptions import ElementNotVisibleException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# from mongothon import Schema
def _ascii_text(cell):
    """Return str(cell) with every non-ASCII character dropped."""
    return str(cell).encode('ascii', 'ignore').decode()


def _split_labelled_cell(cell):
    """Split a 'Label: value' cell into (label, value); None if it has no such form."""
    if not isinstance(cell, str):
        # empty table cells come back from pandas as NaN floats
        return None
    label, separator, value = cell.partition(': ')
    if not separator:
        return None
    return label, value


def _parse_course_page(response):
    """Build a flat dict of course fields from one course detail page response.

    Reads the semester from the page text and the HTML tables at positions
    4 (main details), 5 (meeting details), 6 (location) and 7 (seat
    availability) as returned by ``pd.read_html``.

    Raises:
        ValueError: if ``pd.read_html`` finds no tables in the page.
        IndexError, KeyError: if an expected table or row is missing.
    """
    course_dict = {}
    dfs = pd.read_html(response.content)

    # get semester (left out when the page does not mention one)
    semester_match = re.search(
        r'(Fall [0-9]*|Spring [0-9]*|Summer [0-9]*)', response.text)
    if semester_match is not None:
        course_dict['Semester'] = semester_match.group(1)

    # parse the main details table: first row, columns in this fixed order
    course_details_columns = ['header', 'Id', 'Subject', 'Number', 'Section', 'Title',
                              'Dates', 'Days', 'Time', 'Credits', 'Status', 'Instructor',
                              'Delivery Method']
    for column, cell in zip(course_details_columns, dfs[4].values[0]):
        value = _ascii_text(cell)
        if value != 'nan':
            course_dict[column] = value

    # parse the meeting details table: row 0 holds the labels, row 1 the values
    meeting_labels = dfs[5].loc[0].values
    meeting_values = dfs[5].loc[1].values
    for label, cell in zip(meeting_labels, meeting_values):
        course_dict[label] = _ascii_text(cell)

    # parse the location details table: top-left 2x2 cells of 'Label: value' text
    for row in (0, 1):
        for col in (0, 1):
            pair = _split_labelled_cell(dfs[6].iloc[row, col])
            if pair is not None:
                key, value = pair
                course_dict[key] = value.strip()

    # parse the seat availability table: first row of 'Label: value' cells
    for item in dfs[7].values[0]:
        pair = _split_labelled_cell(item)
        if pair is not None:
            key, value = pair
            course_dict[key] = value

    course_dict['url'] = response.url
    return course_dict


def get_courses():
    """Fetch every course detail page and parse the information on it.

    Issues a GET for each URL in the module-level ``course_links_dict``
    through a requests-toolbelt thread pool, parses each response into a
    dict, prints it and hands it to ``send_to_db``. A page that cannot be
    parsed is reported on stderr and skipped so that one malformed page
    does not abort the whole batch.
    """
    q = queue.Queue()
    for url in course_links_dict.values():
        q.put({'method': 'GET', 'url': url})

    # call to requests toolbelt to get all the course detail pages
    p = pool.Pool(job_queue=q, num_processes=30)
    p.join_all()

    for response in p.responses():
        try:
            course_dict = _parse_course_page(response)
        except (IndexError, KeyError, ValueError) as err:
            print('skipping {}: {!r}'.format(response.url, err), file=sys.stderr)
            continue
        print(course_dict)
        print('===========')
        send_to_db(course_dict)
def send_to_db(course_dict):
    """Placeholder for persisting one parsed course; currently does nothing.

    Args:
        course_dict: the parsed course record (ignored for now).

    Returns:
        None.
    """
    return None
def get_course_links():
    """Collect course detail URLs from every page of the current search results.

    Scans all anchors on the page currently loaded in the module-level
    ``driver``. Each href containing '/registration/search/detail' is stored
    in the module-level ``course_links_dict`` under the digits that follow
    ``courseid=`` in the URL. The pager's "next" link is then clicked and the
    scan repeats; the function returns once that click fails.
    """
    # Loop instead of recursing so a long result list cannot exhaust the stack.
    while True:
        for link in driver.find_elements(By.TAG_NAME, "a"):
            url = link.get_attribute("href")
            if url is None or '/registration/search/detail' not in url:
                continue
            id_match = re.search(r'courseid=([0-9]*)', url)
            if id_match is None:
                # detail link without a courseid parameter: nothing to key it by
                continue
            course_links_dict[id_match.group(1)] = url

        try:
            driver.find_element(By.ID, "yui-pg0-0-next-link").click()
        except WebDriverException:
            # no (clickable) next link, so this was the last results page
            return
# Search form page; reloaded after every submitted search.
SEARCH_FORM_URL = 'https://eservices.minnstate.edu/registration/search/basic.html?campusid=304'
# File holding the {course id: detail URL} mapping gathered by a previous run.
COURSE_LINKS_CACHE = 'course_links_dict'

course_links_dict = {}
try:  # try to open the previously gathered course list
    with open(COURSE_LINKS_CACHE, 'r') as f:
        # literal_eval reads the str(dict) format written below without
        # executing arbitrary code from the file
        cached_links = ast.literal_eval(f.read())
except (OSError, ValueError, TypeError, SyntaxError):
    # missing, unreadable or corrupt cache: fall through and rebuild it
    cached_links = None
if isinstance(cached_links, dict):
    course_links_dict = cached_links

if not course_links_dict:  # else build the course list
    driver = webdriver.Chrome()
    try:
        driver.get(SEARCH_FORM_URL)
        semester_count = len(
            driver.find_element(By.NAME, "yrtr").find_elements(By.TAG_NAME, 'option'))
        subject_count = len(
            driver.find_element(By.NAME, 'subject').find_elements(By.TAG_NAME, 'option'))

        # click through the form for every semester/subject combination;
        # subject option 0 is skipped (presumably a placeholder entry -- confirm)
        for semester_index in range(semester_count):
            for subject_index in range(1, subject_count):
                # look the elements up again on each pass: the form page is
                # reloaded at the end of every iteration
                semester_options = driver.find_element(
                    By.NAME, "yrtr").find_elements(By.TAG_NAME, 'option')
                semester_options[semester_index].click()
                subject_options = driver.find_element(
                    By.NAME, 'subject').find_elements(By.TAG_NAME, 'option')
                subject_options[subject_index].click()
                # third 'openValue' radio button (its meaning is not visible here)
                driver.find_elements(
                    By.CSS_SELECTOR, "input[type='radio'][name='openValue']")[2].click()
                driver.find_element(
                    By.CSS_SELECTOR, "input[type='submit'][title='Search']").click()
                get_course_links()
                # get the original form page again
                driver.get(SEARCH_FORM_URL)
    finally:
        # always close the browser, even when scraping fails part-way
        driver.quit()

    # write the course list out to file
    with open(COURSE_LINKS_CACHE, 'w') as f:
        f.write(str(course_links_dict))

get_courses()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement