# we can pretty much just reuse everything.
import re
import socket
import time

import bs4
import numpy as np
import urllib2  # Python 2 only; Python 3 moved this to urllib.request

# Fixed url for job postings containing data scientist
url = 'http://www.indeed.com/jobs?q=data+scientist&l='
# read the website
source = urllib2.urlopen(url).read()
# parse html code
bs_tree = bs4.BeautifulSoup(source, 'html.parser')

# see how many job postings we found
job_count_string = bs_tree.find(id='searchCount').contents[0]
job_count_string = job_count_string.split()[-2]
print("Search yielded %s hits." % (job_count_string))
# print(int(job_count_string.replace(',', '')))

# note that job_count_string so far is still a string,
# not an integer, and the , separator prevents
# us from just casting it to int

job_count_digits = [int(d) for d in job_count_string if d.isdigit()]
job_count = np.sum([digit*(10**exponent) for digit, exponent in
                    zip(job_count_digits[::-1], range(len(job_count_digits)))])
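# For example, job_count_string == "1,234" gives job_count_digits == [1, 2, 3, 4];
# reversing and zipping with exponents yields 4*10**0 + 3*10**1 + 2*10**2 + 1*10**3 = 1234.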

print(job_count)

num_pages = int(np.ceil(job_count/10.0))
base_url = 'http://www.indeed.com/viewjob?'
job_links = []
for i in range(100):  # do range(num_pages) if you want them all
    url = 'http://www.indeed.com/jobs?q=data+scientist&start=' + str(i*10)
    html_page = urllib2.urlopen(url).read()
    bs_tree = bs4.BeautifulSoup(html_page, 'html.parser')
    job_link_area = bs_tree.find(id='resultsCol')
    job_postings = job_link_area.findAll("div")
    job_postings = [jp for jp in job_postings if jp.get('class') is not None
                    and ''.join(jp.get('class')) == "rowresult"]
    # print(job_postings)
    job_ids = [jp.get('data-jk') for jp in job_postings]

    # go after each link, but cap the crawl at 1000 links
    for job_id in job_ids:
        if len(job_links) == 1000:
            break
        job_links.append(base_url + 'jk=' + job_id)

        time.sleep(1)

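# Each entry of job_links now looks like
# 'http://www.indeed.com/viewjob?jk=<data-jk id>'.
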
skill_set = {
    'mapreduce': 0, 'spark': 0, 'visualization': 0, 'hadoop': 0, 'sql': 0,
    'teradata': 0, 'sas': 0, 'excel': 0, 'access': 0, 'java': 0,
    'r': 0, 'python': 0, 'machine learning': 0, 'statistics': 0, 'aws': 0,
    'databricks': 0, 'nlp': 0, 'tableau': 0, 'watson': 0, 'scikit-learn': 0
}
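
# The substring test used in the loop below ('key in html_text') over-counts
# short keys such as 'r' or 'sas', which also occur inside ordinary words.
# A minimal sketch of a stricter whole-word check; the has_skill helper is
# an assumption of this sketch, not part of the original script:
def has_skill(skill, text):
    """Return True if skill occurs as a whole word or phrase in text."""
    return re.search(r'\b' + re.escape(skill) + r'\b', text) is not None
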
counter = 0
j_arr = []
for link in job_links:
    counter += 1
    html_page = ""
    try:
        html_page = urllib2.urlopen(link).read()
    except urllib2.HTTPError:
        print("HTTPError:")
        continue
    except urllib2.URLError:
        print("URLError:")
        continue
    except socket.error:
        print("Connection closed")
        continue
    html_text = re.sub("[^a-z.+3]", " ", html_page.lower())  # replace all but the listed characters
    tree2 = bs4.BeautifulSoup(html_page, 'html.parser')
    # parse for title, name, location, skills
    header = str(tree2.find('title'))
    # print(header)
    # e.g. "Data Scientist - Reliability job - Tesla - Palo Alto, CA | Indeed.com"

    # "Data Scientist - Reliability job - Tesla - Palo Alto, CA"
    header = header.split("|")[0]

    try:
        # "Data Scientist - Reliability"
        job_t = header.split(" job - ")[0].replace("<title>", "")
        # print("t: " + job_t)

        # "Tesla"
        job_c = header.split(" job - ")[1].split(" - ")[0]
        # print("c: " + job_c)

        # "Palo Alto, CA"
        job_l = header.split(" job - ")[1].split(" - ")[1]
        # print("l: " + job_l)
    except IndexError:
        continue  # title didn't match the "<role> job - <company> - <location>" pattern

    job_s = []
    for key in skill_set.keys():
        if key in html_text:  # substring match; see the stricter has_skill sketch above
            skill_set[key] += 1
            job_s.append(key)
        else:
            job_s.append(None)

    j = tuple([job_t, job_c, job_l] + job_s)
    j_arr.append(j)

    if counter % 5 == 0:
        print(len(job_links) - counter)  # links remaining
        # print(skill_set)

print(skill_set)
print(j_arr)
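
# A minimal sketch of tabulating the scraped tuples, assuming pandas is
# installed; the column order mirrors the skill_set iteration order used above.
import pandas as pd

columns = ['title', 'company', 'location'] + skill_set.keys()
jobs_df = pd.DataFrame(j_arr, columns=columns)
print(jobs_df.head())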