# we can pretty much just reuse everything; the imports the script needs are added here
import re
import socket
import time
import urllib2
import bs4
import numpy as np

# Fixed url for job postings containing "data scientist"
url = 'http://www.indeed.com/jobs?q=data+scientist&l='
# read the website
source = urllib2.urlopen(url).read()
# parse the html
bs_tree = bs4.BeautifulSoup(source, 'html.parser')
# see how many job postings we found
job_count_string = bs_tree.find(id='searchCount').contents[0]
job_count_string = job_count_string.split()[-2]
print("Search yielded %s hits." % (job_count_string))

# note that job_count_string so far is still a string, not an integer,
# and the ',' thousands separator prevents us from just casting it to int
job_count_digits = [int(d) for d in job_count_string if d.isdigit()]
job_count = np.sum([digit * (10 ** exponent) for digit, exponent in
                    zip(job_count_digits[::-1], range(len(job_count_digits)))])
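
# Aside: a simpler equivalent, assuming the hit count only ever contains
# digits and ',' thousands separators, is to strip the separator and cast:
assert job_count == int(job_count_string.replace(',', ''))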
print job_count

# Indeed shows ten postings per result page
num_pages = int(np.ceil(job_count / 10.0))
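# The same ceiling division in pure integer arithmetic, avoiding the float
# round-trip (a minor stylistic alternative):
assert num_pages == (job_count + 9) // 10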
base_url = 'http://www.indeed.com/viewjob?'
job_links = []

for i in range(100):  # do range(num_pages) if you want them all
    url = 'http://www.indeed.com/jobs?q=data+scientist&start=' + str(i * 10)
    html_page = urllib2.urlopen(url).read()
    bs_tree = bs4.BeautifulSoup(html_page, 'html.parser')

    job_link_area = bs_tree.find(id='resultsCol')
    job_postings = job_link_area.findAll("div")
    # keep only the divs whose class list joins to "rowresult",
    # i.e. class="row result"
    job_postings = [jp for jp in job_postings if jp.get('class') is not None
                    and ''.join(jp.get('class')) == "rowresult"]
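
    # Equivalent filter using bs4's exact-string class matching (a sketch;
    # assumes the result rows carry the literal attribute class="row result"):
    #   job_postings = job_link_area.find_all("div", class_="row result")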
    job_ids = [jp.get('data-jk') for jp in job_postings]

    # build a direct link to each posting from its job id
    for job_id in job_ids:
        if len(job_links) == 1000:
            break
        job_links.append(base_url + 'jk=' + job_id)

    time.sleep(1)  # be polite: pause between result pages
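
# A small helper like the sketch below could centralize the retry and error
# handling that the scoring loop does inline; 'fetch_page' is a hypothetical
# helper, not part of the original script.
def fetch_page(link, retries=3, pause=2):
    # fetch a URL, retrying on transient errors; return None on failure
    for _ in range(retries):
        try:
            return urllib2.urlopen(link).read()
        except (urllib2.HTTPError, urllib2.URLError, socket.error):
            time.sleep(pause)
    return None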

# skills we want to count, mapped to the number of postings mentioning them
skill_set = {
    'mapreduce': 0, 'spark': 0, 'visualization': 0, 'hadoop': 0, 'sql': 0,
    'teradata': 0, 'sas': 0, 'excel': 0, 'access': 0, 'java': 0,
    'r': 0, 'python': 0, 'machine learning': 0, 'statistics': 0, 'aws': 0,
    'databricks': 0, 'nlp': 0, 'tableau': 0, 'watson': 0, 'scikit-learn': 0
}

counter = 0
j_arr = []
for link in job_links:
    counter += 1

    html_page = ""
    try:
        html_page = urllib2.urlopen(link).read()
    except urllib2.HTTPError:
        print "HTTPError:"
        continue
    except urllib2.URLError:
        print "URLError:"
        continue
    except socket.error:
        print "Connection closed"
        continue

    # replace all but the listed characters with spaces; '-' is kept so that
    # 'scikit-learn' survives the cleanup and can still be matched below
    html_text = re.sub("[^a-z.+3-]", " ", html_page.lower())
    tree2 = bs4.BeautifulSoup(html_page, 'html.parser')

    # parse the <title> for job title, company, and location, e.g.
    # "Data Scientist - Reliability job - Tesla - Palo Alto, CA | Indeed.com"
    header = str(tree2.find('title'))
    header = header.split("|")[0]
    parts = header.split(" job - ")
    if len(parts) < 2 or " - " not in parts[1]:
        continue  # skip pages whose <title> doesn't follow the expected pattern
    # "Data Scientist - Reliability"
    job_t = parts[0].replace("<title>", "")
    # "Tesla"
    job_c = parts[1].split(" - ")[0]
    # "Palo Alto, CA"
    job_l = parts[1].split(" - ")[1]
    # check which skills are mentioned; tokenize once so that single-letter
    # skills like 'r' match whole words rather than any occurrence of the letter
    html_words = set(html_text.split())
    job_s = []
    for key in skill_set.keys():
        # multi-word skills need a substring check on the full text,
        # single tokens must match a whole word
        found = (key in html_text) if ' ' in key else (key in html_words)
        if found:
            skill_set[key] += 1
            job_s.append(key)
        else:
            job_s.append(None)

    j = tuple([job_t, job_c, job_l] + job_s)
    j_arr.append(j)

    if counter % 5 == 0:
        print len(job_links) - counter  # links left to process

print skill_set
print j_arr
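
# For a more readable summary, the counts can be sorted before printing
# (a small sketch, not part of the original output):
for skill, count in sorted(skill_set.items(), key=lambda kv: -kv[1]):
    print "%-16s %d" % (skill, count)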