Advertisement
Guest User

Untitled

a guest
Oct 17th, 2017
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.10 KB | None | 0 0
  1. from selenium.webdriver.common.keys import Keys
  2. from selenium import webdriver
  3. from bs4 import BeautifulSoup
  4. from collections import defaultdict
  5. import urllib.request
  6. import requests
  7. import sqlite3
  8. import json
  9. import os
  10. import sys
  11. import time
  12.  
  13. def setup_webdriver(path_to_driver):
  14.     """set up webdriver"""
  15.     chromedriver = path_to_driver
  16.     os.environ["webdriver.chrome.driver"] = chromedriver
  17.     driver = webdriver.Chrome(chromedriver)
  18.     return driver  
  19.  
  20.  
  21. def create_soup(url):
  22.     """create bs4 object"""
  23.     r = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"})
  24.     return BeautifulSoup(r.content, "html5lib")
  25.  
  26.  
  27. def queryJobs(driver, jobTitle, jobLocation):
  28.     """given an active driver, searches & scrapes job postings into dictionary """
  29.     # go to url & search for jobs ordered by last 7 days, 500 jobs/page & most recent first
  30.     query_url  = 'https://www.careeronestop.org/JobSearch/job-Search.aspx'
  31.     driver.get(query_url)
  32.     driver.find_element_by_id('txtLocation').clear()
  33.     driver.find_element_by_id('txtLocation').send_keys(jobLocation)
  34.     driver.find_element_by_id('txtOccupation').clear()
  35.     driver.find_element_by_id('txtOccupation').send_keys(jobTitle)
  36.     driver.find_element_by_id('btnFindJob').click()
  37.     driver.get(driver.current_url + '&datfilter=7' + '&pagesize=500' + '&sortcolumns=accquisitiondate&sortdirections=DSC')
  38.    
  39.     # get current url to scrape all links
  40.     searchResultsUrl = driver.current_url
  41.    
  42.     searchResultsSoup = create_soup(searchResultsUrl)
  43.     table = searchResultsSoup.find('div', {'class':'datagrid no-more-tables'})
  44.     body = table.find('tbody')
  45.    
  46.     # create separate lists & zip into dictionary
  47.     company = [i.get_text() for i in body.find_all('td', {'data-title': 'Company'})]
  48.     title = [i.get_text() for i in body.find_all('td', {'data-title': 'Job Title'})]
  49.     location = [i.get_text() for i in body.find_all('td', {'data-title': 'Location'})]
  50.     title1 = [i.strip() for i in title]
  51.     url = [i['href'] for i in body.find_all('a', {'target': '_blank'})]
  52.    
  53.     jobsDict = {z[0]:list(z[1:]) for z in zip(url,title1,company, location)}
  54.     return jobsDict
  55.  
  56.  
  57. def create_db_table(full_path_to_db):
  58.     """create database table & names columns"""
  59.     conn = sqlite3.connect(full_path_to_db)
  60.     c = conn.cursor()
  61.     c.execute('''CREATE TABLE career_jobs
  62.        (id integer primary key, data,
  63.        company_name text,
  64.        job_title text)''')
  65.  
  66.     conn.commit()
  67.     conn.close()
  68.  
  69. def filterJobTitles(jobsDict, keywordsList, all_=True):
  70.     """any=True, all=False = matches all
  71.       any=True, all=True = matches any"""
  72.     updatedJobsDict = {}
  73.     anyCount = 0
  74.     allCount = 0
  75.     for k,v in jobsDict.items():
  76.         lower = v[0].lower()
  77.         if all_:
  78.             if all(word in lower for word in keywordsList):
  79.                 allCount+=1
  80.                 updatedJobsDict[k] = v
  81.         else:
  82.             if any(word in lower for word in keywordsList):
  83.                 anyCount+=1
  84.                 updatedJobsDict[k] = v
  85.    
  86.     if allCount > 0:
  87.         print('Searching for all word to match')
  88.     if anyCount > 0:
  89.         print('Searching for any words to match')
  90.        
  91.     if allCount == 0:
  92.         print('No all matches found')
  93.     if anyCount == 0:
  94.         print('No any matches found')
  95.      
  96.     return updatedJobsDict
  97.  
  98.  
  99. def addToDb(dbPath, jobsDict):
  100.  
  101.     uniqueURLS = []
  102.     for k,v in jobsDict.items():
  103.         URL = k
  104.         jobTitle = v[0]
  105.         jobCompany = v[1]
  106.    
  107.         conn = sqlite3.connect(dbPath)
  108.         c = conn.cursor()
  109.  
  110.         c.execute('SELECT * FROM career_jobs WHERE (company_name=? AND job_title=?)', (jobCompany, jobTitle))
  111.         entry = c.fetchone()
  112.  
  113.         if entry is None:
  114.             c.execute("insert or ignore into career_jobs (company_name, job_title) values (?, ?)", (jobCompany, jobTitle))
  115.             conn.commit()
  116.             uniqueURLS.append(URL)
  117.             print('\nNew Entry added\n{} - {}'.format(jobTitle, jobCompany))
  118.         else:
  119.             print ('\n>>>>>>>>>Entry found<<<<<<<<<\n', jobTitle, jobCompany)
  120.            
  121.     return uniqueURLS
  122.  
  123. def scrapeOneStop(driverPath, jobTitle, jobLocation, dbPath, firstTable=None,
  124.                   all_=None, keywordsList=None):
  125.    
  126.     """combines all functions"""
  127.     driver = setup_webdriver(driverPath)
  128.    
  129.     if firstTable:
  130.         create_db_table(dbPath)
  131.    
  132.     jobsDict = queryJobs(driver, jobTitle, jobLocation)
  133.    
  134.     if keywordsList:
  135.         jobsDict = filterJobTitles(jobsDict, all_=all_, keywordsList=keywordsList)
  136.        
  137.     if len(jobsDict) == 0:
  138.         print('No matches found.  Try the any/all instead of all/any')
  139.         input('Press enter to exit.')
  140.         sys.exit(1)
  141.     uniqueURLS =  addToDb(dbPath, jobsDict)
  142.  
  143.     for idx,i in enumerate(uniqueURLS):
  144.         concatURL = 'window.open("' + i + '","_blank");'
  145.         driver.execute_script(concatURL)
  146.         print('\nShowing ', idx, ' of ', len(uniqueURLS))
  147.         if idx %10 == 0:
  148.             input('Press enter to continue')
  149.  
  150.  
  151.  
  152.  
  153.  
  154. # user inputs
  155. dbPath = input('\nEnter FULL PATH to databse ending with .sqlite\n> ')
  156. dbCreate = input('\nIs this a new database? true/false\n> ')
  157. driverPath = input('\nEnter FULL PATH to web driver\n> ')
  158. jobTitle = input('\nEnter job title to search\n> ')
  159. jobLocation = input('\nEnter job location\n> ')
  160. filterOrNot = input('\nFilter job titles by keywords? y/n\n> ')
  161.  
  162.  
  163. # logic
  164. dbCreateBool = json.loads(dbCreate)
  165. if filterOrNot in ['y', 'yes']:
  166.     typeMatch = input('\nMatch all of the words in keywords? true/false\n> ')
  167.     keywords = input('\nEnter space delimited list of keywords.\ne.g. junior software python\n> ')
  168.    
  169.     # format inputs for functions
  170.     matchList = list(keywords.split())
  171.     bools = json.loads(typeMatch)
  172.  
  173.     if dbCreate == 'true' and bools == True:
  174.         scrapeOneStop(driverPath, jobTitle, jobLocation, dbPath,
  175.                       firstTable=dbCreateBool, all_=True, keywordsList=matchList)
  176.         input('Press enter to exit')
  177.        
  178.     elif dbCreate == 'true' and bools !=True:
  179.         scrapeOneStop(driverPath, jobTitle, jobLocation, dbPath,
  180.                       firstTable=dbCreateBool, all_=False, keywordsList=matchList)
  181.         input('Press enter to exit')
  182.        
  183.     elif dbCreate == 'false' and not bools == True:
  184.         scrapeOneStop(driverPath, jobTitle, jobLocation, dbPath, all_=True, keywordsList=matchList)
  185.         input('Press enter to exit')
  186.  
  187.     elif dbCreate == 'false' and bools == False:
  188.         scrapeOneStop(driverPath, jobTitle, jobLocation, dbPath, all_=False, keywordsList=matchList)
  189.         input('Press enter to exit')
  190.  
  191. else:
  192.     if dbCreate == 'true':
  193.         print('other else')
  194.         scrapeOneStop(driverPath, jobTitle, jobLocation, dbPath, firstTable=dbCreateBool)
  195.         input('Press enter to exit')
  196.     else:
  197.         print('other other else')
  198.         scrapeOneStop(driverPath, jobTitle, jobLocation, dbPath)
  199.         input('Press enter to exit')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement