daily pastebin goal
29%
SHARE
TWEET

Untitled

a guest Feb 13th, 2018 80 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  # We can pretty much just reuse everything.

  # Fixed search URL for job postings containing "data scientist".
  url = 'http://www.indeed.com/jobs?q=data+scientist&l='
  # Download the search results page and parse its HTML.
  source = urllib2.urlopen(url).read()
  bs_tree = bs4.BeautifulSoup(source)

  # See how many job postings were found: the hit count is taken as the
  # second-to-last whitespace-separated token of the "searchCount" element.
  job_count_string = bs_tree.find(id='searchCount').contents[0]
  job_count_string = job_count_string.split()[-2]
  print("Search yielded %s hits." % (job_count_string))

  # Note that job_count_string is still text, not an integer, and the ","
  # separator prevents casting it to int directly; so keep only the digit
  # characters and convert those.
  job_count_digits = [int(d) for d in job_count_string if d.isdigit()]
  # Joining the digits and converting once yields a plain int; fall back
  # to 0 when the token contained no digits at all.
  job_count = int(''.join(str(d) for d in job_count_digits) or '0')

  print(job_count)
  25.  
  # Collect links to individual job postings from the paginated search
  # results.  The code assumes 10 postings per results page (division by
  # 10.0 here, start offsets of i * 10 below).
  num_pages = int(np.ceil(job_count / 10.0))
  base_url = 'http://www.indeed.com/viewjob?'
  max_pages = 100    # page cap; raise it to num_pages to crawl everything
  max_links = 1000   # stop once this many posting links were collected
  job_links = []
  for i in range(min(num_pages, max_pages)):
      # Stop requesting further pages as soon as the link cap is reached.
      if len(job_links) >= max_links:
          break

      url = 'http://www.indeed.com/jobs?q=data+scientist&start=' + str(i * 10)
      html_page = urllib2.urlopen(url).read()
      # Pause once per request (not once per posting) between downloads.
      time.sleep(1)

      bs_tree = bs4.BeautifulSoup(html_page)
      job_link_area = bs_tree.find(id='resultsCol')
      if job_link_area is None:
          # No results column on this page: nothing to harvest here.
          continue

      # Keep only the <div> elements whose class names concatenate to
      # "rowresult" (e.g. class="row result").
      job_postings = [jp for jp in job_link_area.findAll("div")
                      if jp.get('class') is not None
                      and ''.join(jp.get('class')) == "rowresult"]
      job_ids = [jp.get('data-jk') for jp in job_postings]

      # Turn each job key into a link to the posting's own page.
      for job_id in job_ids:
          if job_id is None:
              # Posting without a data-jk attribute: no link can be built.
              continue
          if len(job_links) >= max_links:
              break
          job_links.append(base_url + 'jk=' + job_id)
  48.  
  # Occurrence counters for the skill keywords searched for in each job
  # posting; every keyword starts with a count of zero.
  skill_set = dict.fromkeys(
      [
          'mapreduce', 'spark', 'visualization', 'hadoop', 'sql',
          'teradata', 'sas', 'excel', 'access', 'java',
          'r', 'python', 'machine learning', 'statistics', 'aws',
          'databricks', 'nlp', 'tableau', 'watson', 'scikit-learn',
      ],
      0,
  )
  # Visit every collected posting, extract title / company / location from
  # the page <title>, and record which skills from skill_set it mentions.

  # Runs of characters other than a-z, '.', '+' and '3' are collapsed to a
  # single space before matching.  The same normalization is applied to the
  # skill names, so keys containing other characters (the hyphen in
  # 'scikit-learn') can still be found in the normalized page text.
  non_skill_chars = re.compile(r"[^a-z.+3]+")
  skill_patterns = {}
  for key in skill_set:
      normalized_key = non_skill_chars.sub(" ", key.lower()).strip()
      # The lookarounds require the skill to stand alone as a word, so 'r'
      # does not match every word containing an "r" and 'java' does not
      # match "javascript".
      skill_patterns[key] = re.compile(
          r"(?<![a-z])" + re.escape(normalized_key) + r"(?![a-z])")

  counter = 0
  j_arr = []
  for counter, link in enumerate(job_links, 1):
      try:
          html_page = urllib2.urlopen(link).read()
      except urllib2.HTTPError:
          print("HTTPError: %s" % link)
          continue
      except urllib2.URLError:
          print("URLError: %s" % link)
          continue
      except socket.error:
          print("Connection closed: %s" % link)
          continue

      tree2 = bs4.BeautifulSoup(html_page)

      # Parse the page title for job title, company name and location,
      # e.g. "Data Scientist - Reliability job - Tesla - Palo Alto, CA | Indeed.com"
      header = str(tree2.find('title'))
      # "Data Scientist - Reliability job - Tesla - Palo Alto, CA"
      header = header.split("|")[0]
      title_parts = header.split(" job - ")
      company_parts = title_parts[1].split(" - ") if len(title_parts) > 1 else []
      if len(company_parts) < 2:
          # The title does not follow the "<title> job - <company> -
          # <location>" layout; skip the page instead of raising IndexError.
          print("Unrecognized page title: %s" % link)
          continue

      # "Data Scientist - Reliability"
      job_t = title_parts[0].replace("<title>", "").strip()
      # "Tesla"
      job_c = company_parts[0].strip()
      # "Palo Alto, CA"
      job_l = company_parts[1].strip()

      # Match skills against the visible text only: script and style
      # contents are dropped so markup and code do not produce hits.
      for tag in tree2(['script', 'style']):
          tag.decompose()
      html_text = non_skill_chars.sub(" ", tree2.get_text(" ").lower())

      # One slot per skill, in skill_set order: the skill name when the
      # posting mentions it, None otherwise.
      job_s = []
      for key in skill_set:
          if skill_patterns[key].search(html_text):
              skill_set[key] += 1
              job_s.append(key)
          else:
              job_s.append(None)

      j = tuple([job_t, job_c, job_l] + job_s)
      j_arr.append(j)

      # Progress report: number of links still to be processed.
      if counter % 5 == 0:
          print(len(job_links) - counter)

  print(skill_set)
  print(j_arr)
RAW Paste Data
Top