Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from __future__ import print_function

import json
import re
import time

import mysql.connector
import requests
# Indeed publisher-API endpoint. Per-request filters (country, query, paging)
# are appended as query parameters by fetch_page().
api_url = 'http://api.indeed.com/ads/apisearch?publisher=6311497045529956&v=2&limit=100000&format=json'

# Results are paged through in steps of 25. The step must match what the API
# actually returns per page, otherwise the last page is fetched repeatedly.
PAGE_SIZE = 25

# Seconds to wait for the API before giving up; requests has no default timeout.
REQUEST_TIMEOUT = 30


def table_name_for(term):
    """Return the table name used for a search term (spaces become underscores)."""
    return term.replace(' ', '_')


def quote_identifier(name):
    """Return *name* as a backtick-quoted MySQL identifier.

    Table names cannot be passed as query parameters, so the name is quoted
    here instead of being concatenated raw into the statement. Embedded
    backticks are doubled so the name cannot terminate the quoting.
    """
    return '`' + name.replace('`', '``') + '`'


def page_offsets(total_results, page_size=PAGE_SIZE):
    """Return the ``start`` offsets needed to page through *total_results* rows."""
    return range(0, total_results, page_size)


def utf8_or_none(value):
    """Encode a text value as UTF-8 bytes, passing ``None`` through unchanged."""
    if value is None:
        return None
    return value.encode("utf-8")


def row_from_item(item):
    """Build the insert parameters for one job posting from the API response.

    Returns a 7-tuple in column order: postDate, jobtitle, company,
    formattedLocationFull, url, latitude, longitude. Missing text fields
    become ``None``; latitude/longitude are always stringified.
    """
    date = utf8_or_none(item.get("date"))
    # NOTE(review): presumably the API date looks like "Mon, 02 Jan 2017 ...",
    # so [5:16] keeps the "DD Mon YYYY" part -- confirm against a live response.
    post_date = date[5:16] if date is not None else None
    return (
        post_date,
        utf8_or_none(item.get("jobtitle")),
        utf8_or_none(item.get("company")),
        utf8_or_none(item.get("formattedLocationFull")),
        utf8_or_none(item.get("url")),
        str(item.get("latitude")),
        str(item.get("longitude")),
    )


def fetch_page(term, country, start):
    """Fetch one page of search results and return the decoded JSON document.

    The filters are sent via ``params`` so the search term is URL-encoded
    correctly (spaces, ``&``, ``#`` ...) instead of being pasted into the URL.
    Raises ``requests.HTTPError`` on a non-2xx response.
    """
    params = [
        ('co', country),
        ('q', term),
        ('latlong', 1),
        ('start', start),
        ('fromage', 1),
    ]
    response = requests.get(api_url, params=params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def scrape_term(db, cursor, term, countries=('us',)):
    """Fetch every posting for *term* and insert each one into the term's table.

    The target table is named after the search term (see table_name_for) and
    is expected to exist already with the seven columns listed below.
    """
    table = table_name_for(term)
    insert_sql = (
        "insert into " + quote_identifier(table) +
        " (postDate, jobtitle, company, formattedLocationFull, url, latitude, longitude) "
        "values (%s, %s, %s, %s, %s, %s, %s)"
    )

    for country in countries:
        # The first page doubles as the source of the total result count.
        first_page = fetch_page(term, country, 0)
        total = first_page["totalResults"]
        print(total)

        remaining = total
        for offset in page_offsets(total):
            page = first_page if offset == 0 else fetch_page(term, country, offset)
            for item in page["results"]:
                row = row_from_item(item)
                print(table)
                cursor.execute(insert_sql, row)
                # Commit per row so progress survives a crash mid-scrape.
                db.commit()
                print(remaining, 'left---Completed', row[4])
                remaining -= 1


def main():
    """Read search terms from jobs.txt and store the matching job postings."""
    # NOTE(review): credentials are hard-coded in source; consider moving them
    # to configuration or the environment.
    db = mysql.connector.connect(user='root', password='Kielce83', database='indeedProject')
    try:
        cursor = db.cursor()
        try:
            # time.clock() was removed in Python 3.8; time.time() is available
            # on every version and measures wall-clock time.
            started = time.time()

            with open("jobs.txt", "r") as f:
                search_terms = f.read().splitlines()
            print(search_terms)

            for term in search_terms:
                if not term.strip():
                    # A blank line would yield an empty table name.
                    continue
                scrape_term(db, cursor, term)
        finally:
            cursor.close()
    finally:
        db.close()

    print(time.time() - started)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement