Advertisement
Guest User

Untitled

a guest
Feb 28th, 2017
115
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.20 KB | None | 0 0
  1. from __future__ import print_function
  2. import requests
  3. import json
  4. import re
  5. import time
  6. import mysql.connector
  7.  
  8. db = mysql.connector.connect(user='root', password ='Kielce83', database='indeedProject')
  9. cursor = db.cursor()
  10.  
  11. start = time.clock()
  12.  
  13. #Format &q='exact+phrase'
  14.  
  15. api_url = 'http://api.indeed.com/ads/apisearch?publisher=6311497045529956&v=2&limit=100000&format=json'
  16.  
  17. with open("jobs.txt", "r") as f:
  18.     SearchTerms = f.read().splitlines()
  19.    
  20. #SearchTerms = set(['*'])
  21.  
  22. print (SearchTerms)
  23.  
  24. i = 0
  25. j = 0
  26. for Term in SearchTerms:
  27.     SearchTermForSave = Term
  28.     #SearchTermForScrape = '"' + SearchTermForSave + '"'
  29.     number = -25
  30.     i+=1
  31.     #Creating countries set - List of Countries
  32.     countries = set(['us'])
  33.    
  34.     #The actual Scraping
  35.     for SCountry in countries:
  36.        
  37.         Country = SCountry #this is the variable assigned to the country
  38.        
  39.         urlfirst = api_url + '&co=' + Country + '&q=' + Term + '&latlong=1' + '&start=' + str(number + 25) + '&fromage=1'
  40.        
  41.         grabforNum = requests.get(urlfirst)
  42.         json_content = json.loads(grabforNum.content)
  43.         print(json_content["totalResults"])
  44.         results = json_content["totalResults"]
  45.        
  46.         #numresults = json_content["totalResults"]
  47.         # must match the actual number of job results to the lower of the 25 increment or the last page will repeat over and over
  48.    
  49.    
  50.         for number in range(-25, results - 25, 25):  
  51.             url = api_url + '&co=' + Country + '&q=' + Term + '&latlong=1' + '&start=' + str(number + 25) + '&fromage=1'
  52.             response = requests.get(url)
  53.            
  54.             jsonResponse=json.loads(response.content)
  55.             jsonData = jsonResponse["results"]
  56.            
  57.             for item in jsonData:
  58.                 #to grab non strings use str(item.get("totalResults")
  59.                 #f.write (str(numresults))
  60.                
  61.                 date = item.get("date").encode("utf-8")
  62.                 postDate = date[5:16]
  63.                 jobtitle = item.get("jobtitle").encode("utf-8")
  64.                 company = item.get("company").encode("utf-8")
  65.                 formattedLocationFull = item.get("formattedLocationFull").encode("utf-8")
  66.                 url = item.get("url").encode("utf-8")
  67.                 latitude = str(item.get("latitude"))
  68.                 longitude = str(item.get("longitude"))
  69.  
  70.                 for term in SearchTermForSave:
  71.                     term.replace(" ", "_")
  72.  
  73.                 underscoreTerms = [s.replace(' ', '_') for s in SearchTerms]    
  74.  
  75.                 SearchTermForSave = underscoreTerms[j]
  76.  
  77.                 print (underscoreTerms[j])
  78.  
  79.                 cursor.execute("insert into " + underscoreTerms[j] + " (postDate, jobtitle, company, formattedLocationFull, url, latitude, longitude)" "values (%s, %s, %s, %s, %s, %s, %s)" , (postDate, jobtitle, company, formattedLocationFull, url, latitude, longitude))
  80.                
  81.                 db.commit()
  82.                
  83.                 print (results, 'left---Completed' , url)
  84.                 results-=1
  85.     j+=1
  86.  
  87. cursor.close()
  88. db.close()
  89. f.close()
  90.  
  91. elapsed = (time.clock() - start)
  92. print (elapsed)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement