Advertisement
Guest User

Untitled

a guest
Jul 17th, 2017
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.00 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. from urllib import request
  3. from urllib import error
  4. from selenium import webdriver
  5. import os
  6. import csv
  7.  
  8. def get_the_info(url):
  9.     global log_file
  10.     try:
  11.         response = request.urlopen(url)
  12.     except error.URLError as e:
  13.         if hasattr(e,'reson'):
  14.             print("Failed to reach server",file=log_file)
  15.             print("Reason:", e.reason,file=log_file)
  16.             print("URL:",url,file=log_file)
  17.         elif hasattr(e,'code'):
  18.             print("The server couldn\'t fulfill the request",file=log_file)
  19.             print("Error code:", e.code,file=log_file)
  20.             print("URL:",url,file=log_file)            
  21.         return None
  22.        
  23.     soup = BeautifulSoup(response, 'html.parser')
  24.    
  25.     # Getting year from the url
  26.     YEAR = url.split("/")[4][:4]
  27.     # Getting name from the page
  28.     NAME = soup.h1.text.strip()
  29.     # Getting the overview text
  30.     BACH_OVERVIEW = soup.find_all("div",class_="aside-quick-stats")[0].p.text
  31.     if BACH_OVERVIEW is None:
  32.         BACH_OVERVIEW = "--None--"
  33.  
  34.     # Testing for instituition type by word possession
  35.     if not BACH_OVERVIEW:
  36.         INSTITUTION_TYPE="INSTITUTION_TYPE"
  37.     elif "private" in BACH_OVERVIEW:
  38.         INSTITUTION_TYPE="private"
  39.     elif "public" in BACH_OVERVIEW:
  40.         INSTITUTION_TYPE="public"
  41.     elif "proprietary" in BACH_OVERVIEW:
  42.         INSTITUTION_TYPE="proprietary"
  43.     else:
  44.         INSTITUTION_TYPE="---None---"
  45.  
  46. #    content = soup.find_all("div",class_="article-content")[0]
  47. #    YEAR_FOUNDED = content.find_all("td")[7].text.strip()
  48.  
  49.     # Setting a location info1 to be searched for key word
  50.     info1 = soup.find_all("div",class_="aside-quick-stats")[0]
  51.  
  52.     # Searching for a foundation year
  53.     for i in range(len(info1.find_all("td"))):
  54.         if "ear found" in info1.find_all("td")[i].text:
  55.             YEAR_FOUNDED = info1.find_all("td")[i+1].text
  56.  
  57.     # Finding total enrollment value
  58.     TOTAL_ENROLLMENT = soup.find_all("tr",class_="total_enr_all_cy")[0].span.text.strip()
  59.  
  60.     # Three try/except clauses testing for various rankings and scores
  61.     try:
  62.         FAC_CRED_RANK = soup.find_all("tr",class_="v_faculty_and_credentials_rank")[0].span.text.strip()
  63.         FAC_CRED_SCORE = soup.find_all("tr",class_="v_faculty_and_credentials_score")[0].span.text.strip()
  64.     except:
  65.         FAC_CRED_RANK = "--None--"
  66.         FAC_CRED_SCORE = "--None--"
  67.     try:
  68.         STUD_SERV_TECH_RANK = soup.find_all("tr",class_="v_student_services_and_technology_rank")[0].span.text.strip()
  69.         STUD_SERV_TECH_SCORE = soup.find_all("tr",class_="v_student_services_and_technology_score")[0].span.text.strip()
  70.     except:
  71.         STUD_SERV_TECH_RANK = "--None--"
  72.         STUD_SERV_TECH_SCORE = "--None--"
  73.     try:
  74.         STUD_ENGAGEMENT_RANK = soup.find_all("tr",class_="v_engagement_and_accreditation_rank")[0].span.text.strip()
  75.         STUD_ENGAGEMENT_SCORE =soup.find_all("tr",class_="v_engagement_and_accreditation_score")[0].span.text.strip()
  76.     except:
  77.         STUD_ENGAGEMENT_RANK = "--None--"
  78.         STUD_ENGAGEMENT_SCORE = "--None--"
  79.  
  80.     # Full/parttime instrutional faculty values are found
  81.     FULL_TIME_INSTRUCTIONAL = soup.find_all("tr",class_="ft_faculty_count")[0].span.text.strip()
  82.     PART_TIME_INSTRUCTIONAL = soup.find_all("tr",class_="pt_faculty_count")[0].span.text.strip()
  83.  
  84.     # Finding enrollnment numbers for average age of the students
  85.     AVG_STUD_AGE = soup.find_all("tr",class_="average_enrollment_age")[0].span.text.strip()
  86.  
  87.     # Setting base location for tuition information
  88.     tuition = soup.find(class_="fields free_paying")
  89.     # creating an list of the coloumn contents
  90.     sf = tuition.find_all("td")
  91.     # PreSetting values incase they are not included in coloumn
  92.     INSTATE_TUITION_PER_CREDIT = "---None---"
  93.     OUTSTATE_TUITION_PER_CREDIT = "---None---"
  94.     INTERNATIOONAL_TUITUION_PER_CREDIT = "---None---"
  95.    
  96.     # Search each column for different tuition types
  97.     for i in range(len(sf)):
  98.         if "In-state, out-of-district tuition for U.S. students (per credit)" in sf[i].contents[0]:
  99.             INSTATE_TUITION_PER_CREDIT = sf[i+1].span.string.strip()
  100.         elif "Tuition for U.S. students (per credit)" in sf[i].contents[0]:
  101.             INSTATE_TUITION_PER_CREDIT = sf[i+1].span.string.strip()
  102.             OUTSTATE_TUITION_PER_CREDIT = INSTATE_TUITION_PER_CREDIT              
  103.         elif "Out-of-state tuition for U.S. students (per credit)" in sf[i].contents[0]:
  104.             OUTSTATE_TUITION_PER_CREDIT = sf[i+1].span.string.strip()
  105.         elif "Tuition for international students (per credit)" in sf[i].contents[0]:
  106.             INTERNATIOONAL_TUITUION_PER_CREDIT = sf[i+1].span.string.strip()
  107.  
  108.     # the url used to find the site
  109.     ARCHIVE_LINK = url
  110.        
  111.     return (YEAR,
  112.             NAME,
  113.             BACH_OVERVIEW,
  114.             INSTITUTION_TYPE,
  115.             YEAR_FOUNDED,
  116.             TOTAL_ENROLLMENT,
  117.             FAC_CRED_RANK,
  118.             FAC_CRED_SCORE,
  119.             STUD_SERV_TECH_RANK,
  120.             STUD_SERV_TECH_SCORE,
  121.             STUD_ENGAGEMENT_RANK,
  122.             STUD_ENGAGEMENT_SCORE,
  123.             FULL_TIME_INSTRUCTIONAL,
  124.             PART_TIME_INSTRUCTIONAL,
  125.             AVG_STUD_AGE,
  126.             INSTATE_TUITION_PER_CREDIT,
  127.             OUTSTATE_TUITION_PER_CREDIT,
  128.             INTERNATIOONAL_TUITUION_PER_CREDIT,
  129.             ARCHIVE_LINK)
  130.  
  131.  
  132. if __name__ == "__main__":
  133.  
  134.     # Opening a file containing a listing of college urls
  135.     # BACH i.e. bachelors
  136.     school_sites = open('BACH.txt','r')
  137.     SCHOOL_SITES = []
  138.     # Create a list of the urls
  139.     for site in school_sites:
  140.         SCHOOL_SITES.append(site[:-1])
  141.  
  142.     # Open a log and output file
  143.     log_file = open('the_log2.txt','a')
  144.     school_info = open('the_college_data2.csv','a')
  145.     # Create output writer
  146.     csvwriter = csv.writer(school_info,dialect='unix')
  147.     # create a PhantomJS webdriver instance
  148.     driver = webdriver.PhantomJS()
  149.     # Make a dirctory for screenshots
  150.     os.mkdir("the_ScreenShots")
  151.  
  152.     # for each url grab the school informaiton and take a screenshot
  153.     # save the screenshot in the created directory
  154.     for url in SCHOOL_SITES:
  155.         print(url)
  156.         # Clearing variable information for each attepmted url
  157.         the_school_info = []
  158.         try:
  159.             driver.get(url)
  160.             # parsing  the url for use in nameing format
  161.             url_Split = url.split("/")
  162.             # save screenshot containing "date-university_name"
  163.             driver.save_screenshot("the_ScreenShots/"+url_Split[4]+"-"+url_Split[-2]+".png")
  164.             # use the above function to get the needed values returned as a tuple
  165.             the_school_info = get_the_info(url)
  166.         except:
  167.             print("Not complete:",url,file=log_file)
  168.             continue
  169.        
  170.         # as long as there is a info write to file the contents
  171.         if the_school_info:
  172.             csvwriter.writerow(the_school_info)
  173.    
  174.     school_info.close()
  175.     log_file.close()
  176.     school_sites.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement