Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from __future__ import print_function

import json
import os
import re
import time

import mysql.connector
import requests
# Base endpoint of the Indeed job-search API. Further query parameters are
# appended as plain text; search terms go in verbatim, e.g. &q='exact+phrase'.
api_url = 'http://api.indeed.com/ads/apisearch?publisher=6311497045529956&v=2&limit=100000&format=json'

# Two-letter codes of the 50 US states, used to split a large search into
# per-state queries.
Regions = {
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
}

# Step between consecutive `start` offsets when paging through results.
PAGE_SIZE = 25

# Searches reporting more results than this are re-run state by state.
# NOTE(review): presumably the API will not page past this many results for a
# single query -- confirm against the API documentation.
REGION_SPLIT_THRESHOLD = 1025

# Seconds to wait on a single HTTP request before giving up.
REQUEST_TIMEOUT = 60


def build_search_url(term, start_offset=0, region=None):
    """Return the API URL for `term`, starting at result `start_offset`.

    `term` is inserted into the query string unchanged, so it must already be
    in the form the API expects. When `region` is given it is appended as the
    `l` (location) parameter.
    """
    url = (api_url + '&co=us' + '&q=' + term + '&latlong=1'
           + '&start=' + str(start_offset) + '&fromage=1')
    if region is not None:
        url += '&l=' + region
    return url


def page_offsets(total_results):
    """Return the `start` offsets needed to cover `total_results` results.

    Offsets stop at the last page that still holds results; going further
    would make the last page repeat over and over.
    """
    return range(0, total_results, PAGE_SIZE)


def quoted_table_name(term):
    """Return the backtick-quoted MySQL table name for a search term.

    Spaces become underscores; embedded backticks are doubled so the term
    cannot break out of the quoted identifier.
    """
    return '`' + term.replace(' ', '_').replace('`', '``') + '`'


def row_from_item(item):
    """Convert one API result dict into the tuple of values to insert.

    Text fields are UTF-8 encoded; latitude and longitude are stringified.
    A missing text field raises AttributeError, as `None` has no `encode`.
    """
    date = item.get("date").encode("utf-8")
    # Keeps characters 5..15 of the date string.
    # NOTE(review): presumably this is the "DD Mon YYYY" part of an RFC-822
    # style date -- confirm against real API output.
    post_date = date[5:16]
    return (
        post_date,
        item.get("jobtitle").encode("utf-8"),
        item.get("company").encode("utf-8"),
        item.get("formattedLocationFull").encode("utf-8"),
        item.get("url").encode("utf-8"),
        str(item.get("latitude")),
        str(item.get("longitude")),
    )


def fetch_json(url):
    """GET `url` and return the decoded JSON body.

    Raises a requests exception on timeout or a non-success HTTP status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return json.loads(response.content)


def store_page(cursor, db, insert_sql, page_url, remaining):
    """Fetch one page of results and insert every job on it.

    Commits after each row and prints a progress line. Returns `remaining`
    decremented once per stored row.
    """
    for item in fetch_json(page_url)["results"]:
        row = row_from_item(item)
        cursor.execute(insert_sql, row)
        db.commit()
        print(remaining, 'left---Completed', row[4])
        remaining -= 1
    return remaining


def main():
    """Read search terms from jobs.txt and store matching jobs in MySQL.

    Each term is written to the table named after it (spaces replaced by
    underscores). Prints the elapsed wall-clock seconds at the end.
    """
    # SECURITY: credentials were hard-coded in the original script; the
    # defaults are kept for compatibility but should be supplied through the
    # environment instead.
    db = mysql.connector.connect(
        user=os.environ.get('INDEED_DB_USER', 'root'),
        password=os.environ.get('INDEED_DB_PASSWORD', 'Kielce83'),
        database=os.environ.get('INDEED_DB_NAME', 'indeedProject'),
    )
    try:
        cursor = db.cursor()
        try:
            # Wall-clock time: the run is dominated by network waits.
            started = time.time()

            with open("jobs.txt", "r") as f:
                search_terms = f.read().splitlines()
            print(search_terms)

            for term in search_terms:
                if not term.strip():
                    # A blank line would yield an empty query and table name.
                    continue

                insert_sql = (
                    "insert into " + quoted_table_name(term)
                    + " (postDate, jobtitle, company, formattedLocationFull, url, latitude, longitude)"
                    + " values (%s, %s, %s, %s, %s, %s, %s)"
                )

                results = fetch_json(build_search_url(term))["totalResults"]
                print(results)

                if results > REGION_SPLIT_THRESHOLD:
                    # Too many hits for one query: repeat the search per state.
                    for region in sorted(Regions):
                        region_results = fetch_json(
                            build_search_url(term, 0, region))["totalResults"]
                        for offset in page_offsets(region_results):
                            results = store_page(
                                cursor, db, insert_sql,
                                build_search_url(term, offset, region),
                                results)
                else:
                    for offset in page_offsets(results):
                        results = store_page(
                            cursor, db, insert_sql,
                            build_search_url(term, offset),
                            results)
        finally:
            cursor.close()
    finally:
        db.close()

    print(time.time() - started)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement