SHARE
TWEET

Untitled

a guest Jul 17th, 2017 53 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from bs4 import BeautifulSoup
  2. from urllib import request
  3. from urllib import error
  4. from selenium import webdriver
  5. import os
  6. import csv
  7.  
  8. def get_the_info(url):
  9.     global log_file
  10.     try:
  11.         response = request.urlopen(url)
  12.     except error.URLError as e:
  13.         if hasattr(e,'reson'):
  14.             print("Failed to reach server",file=log_file)
  15.             print("Reason:", e.reason,file=log_file)
  16.             print("URL:",url,file=log_file)
  17.         elif hasattr(e,'code'):
  18.             print("The server couldn\'t fulfill the request",file=log_file)
  19.             print("Error code:", e.code,file=log_file)
  20.             print("URL:",url,file=log_file)            
  21.         return None
  22.        
  23.     soup = BeautifulSoup(response, 'html.parser')
  24.    
  25.     # Getting year from the url
  26.     YEAR = url.split("/")[4][:4]
  27.     # Getting name from the page
  28.     NAME = soup.h1.text.strip()
  29.     # Getting the overview text
  30.     BACH_OVERVIEW = soup.find_all("div",class_="aside-quick-stats")[0].p.text
  31.     if BACH_OVERVIEW is None:
  32.         BACH_OVERVIEW = "--None--"
  33.  
  34.     # Testing for instituition type by word possession
  35.     if not BACH_OVERVIEW:
  36.         INSTITUTION_TYPE="INSTITUTION_TYPE"
  37.     elif "private" in BACH_OVERVIEW:
  38.         INSTITUTION_TYPE="private"
  39.     elif "public" in BACH_OVERVIEW:
  40.         INSTITUTION_TYPE="public"
  41.     elif "proprietary" in BACH_OVERVIEW:
  42.         INSTITUTION_TYPE="proprietary"
  43.     else:
  44.         INSTITUTION_TYPE="---None---"
  45.  
  46. #    content = soup.find_all("div",class_="article-content")[0]
  47. #    YEAR_FOUNDED = content.find_all("td")[7].text.strip()
  48.  
  49.     # Setting a location info1 to be searched for key word
  50.     info1 = soup.find_all("div",class_="aside-quick-stats")[0]
  51.  
  52.     # Searching for a foundation year
  53.     for i in range(len(info1.find_all("td"))):
  54.         if "ear found" in info1.find_all("td")[i].text:
  55.             YEAR_FOUNDED = info1.find_all("td")[i+1].text
  56.  
  57.     # Finding total enrollment value
  58.     TOTAL_ENROLLMENT = soup.find_all("tr",class_="total_enr_all_cy")[0].span.text.strip()
  59.  
  60.     # Three try/except clauses testing for various rankings and scores
  61.     try:
  62.         FAC_CRED_RANK = soup.find_all("tr",class_="v_faculty_and_credentials_rank")[0].span.text.strip()
  63.         FAC_CRED_SCORE = soup.find_all("tr",class_="v_faculty_and_credentials_score")[0].span.text.strip()
  64.     except:
  65.         FAC_CRED_RANK = "--None--"
  66.         FAC_CRED_SCORE = "--None--"
  67.     try:
  68.         STUD_SERV_TECH_RANK = soup.find_all("tr",class_="v_student_services_and_technology_rank")[0].span.text.strip()
  69.         STUD_SERV_TECH_SCORE = soup.find_all("tr",class_="v_student_services_and_technology_score")[0].span.text.strip()
  70.     except:
  71.         STUD_SERV_TECH_RANK = "--None--"
  72.         STUD_SERV_TECH_SCORE = "--None--"
  73.     try:
  74.         STUD_ENGAGEMENT_RANK = soup.find_all("tr",class_="v_engagement_and_accreditation_rank")[0].span.text.strip()
  75.         STUD_ENGAGEMENT_SCORE =soup.find_all("tr",class_="v_engagement_and_accreditation_score")[0].span.text.strip()
  76.     except:
  77.         STUD_ENGAGEMENT_RANK = "--None--"
  78.         STUD_ENGAGEMENT_SCORE = "--None--"
  79.  
  80.     # Full/parttime instrutional faculty values are found
  81.     FULL_TIME_INSTRUCTIONAL = soup.find_all("tr",class_="ft_faculty_count")[0].span.text.strip()
  82.     PART_TIME_INSTRUCTIONAL = soup.find_all("tr",class_="pt_faculty_count")[0].span.text.strip()
  83.  
  84.     # Finding enrollnment numbers for average age of the students
  85.     AVG_STUD_AGE = soup.find_all("tr",class_="average_enrollment_age")[0].span.text.strip()
  86.  
  87.     # Setting base location for tuition information
  88.     tuition = soup.find(class_="fields free_paying")
  89.     # creating an list of the coloumn contents
  90.     sf = tuition.find_all("td")
  91.     # PreSetting values incase they are not included in coloumn
  92.     INSTATE_TUITION_PER_CREDIT = "---None---"
  93.     OUTSTATE_TUITION_PER_CREDIT = "---None---"
  94.     INTERNATIOONAL_TUITUION_PER_CREDIT = "---None---"
  95.    
  96.     # Search each column for different tuition types
  97.     for i in range(len(sf)):
  98.         if "In-state, out-of-district tuition for U.S. students (per credit)" in sf[i].contents[0]:
  99.             INSTATE_TUITION_PER_CREDIT = sf[i+1].span.string.strip()
  100.         elif "Tuition for U.S. students (per credit)" in sf[i].contents[0]:
  101.             INSTATE_TUITION_PER_CREDIT = sf[i+1].span.string.strip()
  102.             OUTSTATE_TUITION_PER_CREDIT = INSTATE_TUITION_PER_CREDIT              
  103.         elif "Out-of-state tuition for U.S. students (per credit)" in sf[i].contents[0]:
  104.             OUTSTATE_TUITION_PER_CREDIT = sf[i+1].span.string.strip()
  105.         elif "Tuition for international students (per credit)" in sf[i].contents[0]:
  106.             INTERNATIOONAL_TUITUION_PER_CREDIT = sf[i+1].span.string.strip()
  107.  
  108.     # the url used to find the site
  109.     ARCHIVE_LINK = url
  110.        
  111.     return (YEAR,
  112.             NAME,
  113.             BACH_OVERVIEW,
  114.             INSTITUTION_TYPE,
  115.             YEAR_FOUNDED,
  116.             TOTAL_ENROLLMENT,
  117.             FAC_CRED_RANK,
  118.             FAC_CRED_SCORE,
  119.             STUD_SERV_TECH_RANK,
  120.             STUD_SERV_TECH_SCORE,
  121.             STUD_ENGAGEMENT_RANK,
  122.             STUD_ENGAGEMENT_SCORE,
  123.             FULL_TIME_INSTRUCTIONAL,
  124.             PART_TIME_INSTRUCTIONAL,
  125.             AVG_STUD_AGE,
  126.             INSTATE_TUITION_PER_CREDIT,
  127.             OUTSTATE_TUITION_PER_CREDIT,
  128.             INTERNATIOONAL_TUITUION_PER_CREDIT,
  129.             ARCHIVE_LINK)
  130.  
  131.  
  132. if __name__ == "__main__":
  133.  
  134.     # Opening a file containing a listing of college urls
  135.     # BACH i.e. bachelors
  136.     school_sites = open('BACH.txt','r')
  137.     SCHOOL_SITES = []
  138.     # Create a list of the urls
  139.     for site in school_sites:
  140.         SCHOOL_SITES.append(site[:-1])
  141.  
  142.     # Open a log and output file
  143.     log_file = open('the_log2.txt','a')
  144.     school_info = open('the_college_data2.csv','a')
  145.     # Create output writer
  146.     csvwriter = csv.writer(school_info,dialect='unix')
  147.     # create a PhantomJS webdriver instance
  148.     driver = webdriver.PhantomJS()
  149.     # Make a dirctory for screenshots
  150.     os.mkdir("the_ScreenShots")
  151.  
  152.     # for each url grab the school informaiton and take a screenshot
  153.     # save the screenshot in the created directory
  154.     for url in SCHOOL_SITES:
  155.         print(url)
  156.         # Clearing variable information for each attepmted url
  157.         the_school_info = []
  158.         try:
  159.             driver.get(url)
  160.             # parsing  the url for use in nameing format
  161.             url_Split = url.split("/")
  162.             # save screenshot containing "date-university_name"
  163.             driver.save_screenshot("the_ScreenShots/"+url_Split[4]+"-"+url_Split[-2]+".png")
  164.             # use the above function to get the needed values returned as a tuple
  165.             the_school_info = get_the_info(url)
  166.         except:
  167.             print("Not complete:",url,file=log_file)
  168.             continue
  169.        
  170.         # as long as there is a info write to file the contents
  171.         if the_school_info:
  172.             csvwriter.writerow(the_school_info)
  173.    
  174.     school_info.close()
  175.     log_file.close()
  176.     school_sites.close()
RAW Paste Data
Top