# we can pretty much just reuse everything; the imports the script needs are added here
import re
import socket
import time
import urllib2
import bs4
import numpy as np

# Fixed url for job postings containing "data scientist"
url = 'http://www.indeed.com/jobs?q=data+scientist&l='
# read the website
source = urllib2.urlopen(url).read()
# parse the html
bs_tree = bs4.BeautifulSoup(source, 'html.parser')
# see how many job postings we found
job_count_string = bs_tree.find(id='searchCount').contents[0]
job_count_string = job_count_string.split()[-2]
print("Search yielded %s hits." % (job_count_string))

# note that job_count_string so far is still a string, not an integer,
# and the ',' thousands separator prevents us from just casting it to int
job_count_digits = [int(d) for d in job_count_string if d.isdigit()]
job_count = np.sum([digit * (10 ** exponent) for digit, exponent in
                    zip(job_count_digits[::-1], range(len(job_count_digits)))])
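
# Aside: a simpler equivalent, assuming the hit count only ever contains
# digits and ',' thousands separators, is to strip the separator and cast:
assert job_count == int(job_count_string.replace(',', ''))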
print job_count

# Indeed shows ten postings per result page
num_pages = int(np.ceil(job_count / 10.0))
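# The same ceiling division in pure integer arithmetic, avoiding the float
# round-trip (a minor stylistic alternative):
assert num_pages == (job_count + 9) // 10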
base_url = 'http://www.indeed.com/viewjob?'
job_links = []

for i in range(100):  # do range(num_pages) if you want them all
    url = 'http://www.indeed.com/jobs?q=data+scientist&start=' + str(i * 10)
    html_page = urllib2.urlopen(url).read()
    bs_tree = bs4.BeautifulSoup(html_page, 'html.parser')

    job_link_area = bs_tree.find(id='resultsCol')
    job_postings = job_link_area.findAll("div")
    # keep only the divs whose class list joins to "rowresult",
    # i.e. class="row result"
    job_postings = [jp for jp in job_postings if jp.get('class') is not None
                    and ''.join(jp.get('class')) == "rowresult"]
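
    # Equivalent filter using bs4's exact-string class matching (a sketch;
    # assumes the result rows carry the literal attribute class="row result"):
    #   job_postings = job_link_area.find_all("div", class_="row result")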
    job_ids = [jp.get('data-jk') for jp in job_postings]

    # build a direct link to each posting from its job id
    for job_id in job_ids:
        if len(job_links) == 1000:
            break
        job_links.append(base_url + 'jk=' + job_id)

    time.sleep(1)  # be polite: pause between result pages
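
# A small helper like the sketch below could centralize the retry and error
# handling that the scoring loop does inline; 'fetch_page' is a hypothetical
# helper, not part of the original script.
def fetch_page(link, retries=3, pause=2):
    # fetch a URL, retrying on transient errors; return None on failure
    for _ in range(retries):
        try:
            return urllib2.urlopen(link).read()
        except (urllib2.HTTPError, urllib2.URLError, socket.error):
            time.sleep(pause)
    return None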

# skills we want to count, mapped to the number of postings mentioning them
skill_set = {
    'mapreduce': 0, 'spark': 0, 'visualization': 0, 'hadoop': 0, 'sql': 0,
    'teradata': 0, 'sas': 0, 'excel': 0, 'access': 0, 'java': 0,
    'r': 0, 'python': 0, 'machine learning': 0, 'statistics': 0, 'aws': 0,
    'databricks': 0, 'nlp': 0, 'tableau': 0, 'watson': 0, 'scikit-learn': 0
}

counter = 0
j_arr = []
for link in job_links:
    counter += 1

    html_page = ""
    try:
        html_page = urllib2.urlopen(link).read()
    except urllib2.HTTPError:
        print "HTTPError:"
        continue
    except urllib2.URLError:
        print "URLError:"
        continue
    except socket.error:
        print "Connection closed"
        continue

    # replace all but the listed characters with spaces; '-' is kept so that
    # 'scikit-learn' survives the cleanup and can still be matched below
    html_text = re.sub("[^a-z.+3-]", " ", html_page.lower())
    tree2 = bs4.BeautifulSoup(html_page, 'html.parser')

    # parse the <title> for job title, company, and location, e.g.
    # "Data Scientist - Reliability job - Tesla - Palo Alto, CA | Indeed.com"
    header = str(tree2.find('title'))
    header = header.split("|")[0]
    parts = header.split(" job - ")
    if len(parts) < 2 or " - " not in parts[1]:
        continue  # skip pages whose <title> doesn't follow the expected pattern
    # "Data Scientist - Reliability"
    job_t = parts[0].replace("<title>", "")
    # "Tesla"
    job_c = parts[1].split(" - ")[0]
    # "Palo Alto, CA"
    job_l = parts[1].split(" - ")[1]
    # check which skills are mentioned; tokenize once so that single-letter
    # skills like 'r' match whole words rather than any occurrence of the letter
    html_words = set(html_text.split())
    job_s = []
    for key in skill_set.keys():
        # multi-word skills need a substring check on the full text,
        # single tokens must match a whole word
        found = (key in html_text) if ' ' in key else (key in html_words)
        if found:
            skill_set[key] += 1
            job_s.append(key)
        else:
            job_s.append(None)

    j = tuple([job_t, job_c, job_l] + job_s)
    j_arr.append(j)

    if counter % 5 == 0:
        print len(job_links) - counter  # links left to process

print skill_set
print j_arr
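
# For a more readable summary, the counts can be sorted before printing
# (a small sketch, not part of the original output):
for skill, count in sorted(skill_set.items(), key=lambda kv: -kv[1]):
    print "%-16s %d" % (skill, count)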