# indeed


# coding: utf-8

# In[1]:


# Endpoint the sign-in form is posted to.
login_url = "https://secure.indeed.com/account/login"


# In[2]:


# Form fields sent with the sign-in POST.
# NOTE(review): the e-mail and password look like placeholders - confirm
# where real credentials are expected to come from.
data = {
    "action": "login",
    "__email": "anonymous@gmail.com",
    "__password": "password",
    "remember": "1",
    "hl": "en",
    "continue": "/account/view?hl=en",
}


# In[3]:


# Extra request headers sent when fetching individual resume pages.
# The HTTP header name is hyphenated ("User-Agent"); with an underscore the
# value is sent as an unrelated header and the HTTP library's default user
# agent is used instead.
header = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}


# In[4]:


def listhandler(lst, size=5, fill=''):
    """Return the first ``size`` items of ``lst``, padded with ``fill``.

    The result always has exactly ``size`` elements, so callers can index
    positions ``0 .. size - 1`` without checking the input length.

    Args:
        lst: A sliceable sequence (list, tuple, str). ``None`` or an empty
            sequence yields a list made only of ``fill`` values.
        size: Length of the returned list. Defaults to 5; negative values
            are treated as 0.
        fill: Value used for the missing positions. Defaults to ``''``.

    Returns:
        A new list of length ``size``; ``lst`` itself is not modified.
    """
    size = max(size, 0)
    # Slicing never raises on short input, so no exception handling is
    # needed to detect the missing positions.
    head = list(lst[:size]) if lst else []
    return head + [fill] * (size - len(head))


# In[5]:


# (key template, record field) pairs, in the order the keys are written into
# the "experience" sub-document.
_EXPERIENCE_FIELDS = (
    ('company_%d', 'company'),
    ('title_%d', 'title'),
    ('years of experience_%d', 'years'),
    ('location_%d', 'location'),
    ('description_%d', 'description'),
)

# Same, for the "education" sub-document.
_EDUCATION_FIELDS = (
    ('school_%d', 'school'),
    ('degree_%d', 'degree'),
    ('location_%d', 'location'),
    ('yearsOfeducation_%d', 'years'),
)


def _text_or(tag, default):
    """Return ``tag.text``, or ``default`` when the tag was not found."""
    return default if tag is None else tag.text


def _resume_date(text):
    """Parse one endpoint of a date range such as 'May 2015', 'Jan 2015',
    '2015' or 'Present'.

    Returns ``(moment, has_month)``, or ``None`` when the text matches none
    of the known formats.
    """
    from datetime import datetime

    text = text.strip()
    if text == 'Present':
        # Pin to the first day of the current month, i.e. the same precision
        # a "Month Year" endpoint has.
        today = datetime.today()
        return datetime(today.year, today.month, 1), True
    for fmt, has_month in (('%B %Y', True), ('%b %Y', True), ('%Y', False)):
        try:
            return datetime.strptime(text, fmt), has_month
        except ValueError:
            continue
    return None


def _years_spanned(dates_text):
    """Return the whole years covered by a '<start> to <end>' date text.

    A text without exactly one ' to ' separator counts as 1 year. When both
    endpoints carry a month the span is the rounded day difference over 365;
    otherwise it is the difference of the calendar years. Negative spans are
    clamped to 0. Returns ``None`` when an endpoint cannot be parsed.
    """
    endpoints = dates_text.split(' to ')
    if len(endpoints) != 2:
        return 1
    # Each endpoint is parsed on its own so a range may mix full and
    # abbreviated month names (e.g. 'Jan 2015 to Present').
    start = _resume_date(endpoints[0])
    end = _resume_date(endpoints[1])
    if start is None or end is None:
        return None
    start_at, start_has_month = start
    end_at, end_has_month = end
    if start_has_month and end_has_month:
        years = int(abs(round((end_at - start_at).days / 365, 0)))
    else:
        years = end_at.year - start_at.year
    return max(years, 0)


def _parse_experience(section):
    """Extract title, company, location, years and description from one
    work-experience section of a resume page."""
    company = 'No Company'
    location = 'No Company Location'
    company_tag = section.find('div', class_='work_company')
    if company_tag is not None:
        parts = company_tag.text.split(' - ')
        if len(parts) == 2:
            company, location = parts
        elif len(parts) == 1:
            company = parts[0]
        # More than two parts is ambiguous; both placeholders are kept so
        # the location never leaks in from a previous section.
    dates_tag = section.find('p', class_='work_dates')
    return {
        'title': _text_or(section.find('p', class_='work_title title'),
                          'No Job Title'),
        'company': company,
        'location': location,
        'years': None if dates_tag is None else _years_spanned(dates_tag.text),
        'description': _text_or(section.find('p', class_='work_description'),
                                'No job description'),
    }


def _parse_education(section):
    """Extract degree, school, location and years from one education
    section of a resume page."""
    school_tag = section.find('div', class_='edu_school')
    if school_tag is None:
        school = 'No School Information'
    else:
        school = school_tag.text.split(' - ')[0]
    dates_tag = section.find('p', class_='edu_dates')
    return {
        'degree': _text_or(section.find('p', class_='edu_title'),
                           'No Education Information'),
        'school': school,
        'location': _text_or(section.find('div', class_='inline-block'),
                             'No Education Location Info'),
        'years': None if dates_tag is None else _years_spanned(dates_tag.text),
    }


def _numbered_fields(records, fields):
    """Flatten ``records`` into one dict with keys numbered 1 to 5.

    ``fields`` is a sequence of (key template, record field) pairs. Slots
    beyond the available records receive the padding value supplied by
    ``listhandler``. Building the keys from the slot number keeps every key
    unique.
    """
    columns = [
        (template, listhandler([record[field] for record in records]))
        for template, field in fields
    ]
    flat = {}
    for slot in range(5):
        for template, column in columns:
            flat[template % (slot + 1)] = column[slot]
    return flat


def ingester(string):
    """Scrape every resume linked from one search-results page and index it.

    Posts the module-level ``data`` form to ``login_url``, fetches the
    results page at ``string``, follows each ``a.app_link`` link (sending
    the module-level ``header``), extracts the applicant name, up to five
    work-experience and five education sections, the skills and the
    certifications, and indexes one document per resume into the
    Elasticsearch index ``resume``. The applicant name is printed as
    progress output and the function sleeps two seconds after each resume.

    Args:
        string: URL of the search-results page to process.

    Returns:
        True once every linked resume has been indexed.

    Note:
        The ``id`` stored in each document is the 1-based position of the
        resume on this page, so it restarts at 1 on every call.
    """
    from bs4 import BeautifulSoup
    import requests
    import re
    import time
    from elasticsearch import Elasticsearch

    es = Elasticsearch()
    # One session for the whole page so the login cookies are reused.
    s = requests.session()
    s.post(login_url, data=data)

    listing = BeautifulSoup(s.get(string).text, 'html.parser')
    resume_urls = [
        'https://www.indeed.com' + link.attrs['href']
        for link in listing.find_all('a', class_='app_link')
    ]

    # Compiled once; the patterns do not change between resumes.
    experience_class = re.compile(r'work-experience-section( \w)*')
    education_class = re.compile(r'education-section( \w)*')
    certification_class = re.compile(r'certification-section( \w)*')

    for track, resume_url in enumerate(resume_urls, start=1):
        r = s.get(resume_url, headers=header)
        soup = BeautifulSoup(r.text, 'html.parser')

        name = soup.find('h1').text.title()
        print(name)

        # Only the first five sections of each kind are stored.
        jobs = [
            _parse_experience(section)
            for section in soup.find_all('div', class_=experience_class)[:5]
        ]
        schools = [
            _parse_education(section)
            for section in soup.find_all('div', class_=education_class)[:5]
        ]

        skills = ''.join(
            tag.text for tag in soup.find_all('span', class_='skill-text'))
        certs = ''.join(
            tag.text for tag in soup.find_all('div', class_=certification_class))

        es.index(index='resume',
                 doc_type='test-type',
                 body={'id': track,
                       'applicant': name,
                       "experience": _numbered_fields(jobs, _EXPERIENCE_FIELDS),
                       "education": _numbered_fields(schools, _EDUCATION_FIELDS),
                       "skills": skills,
                       "certifications": certs})
        # Pause between resume requests.
        time.sleep(2)
    return True
        #break
            #Awards

            #Additional Information


# In[6]:


# Search-results URL; the value appended after "start=" selects the page.
url_base = (
    'https://www.indeed.com/resumes'
    '?q=data+scientist&l=GTA%2C+ON&co=CA&cb=jt&start='
)
# Ingest six result pages; the start offset advances by 50 per page.
for i in range(6):
    page_number = i + 1
    print('working on page: %d' % page_number)
    start_offset = i * 50
    ingester(url_base + str(start_offset))


# In[ ]:


# Delete the 'resume' index, i.e. the index ingester() writes its documents to.
# NOTE(review): when this file is run top to bottom as a script, this removes
# the documents the ingestion loop above has just indexed. It looks like a
# manual notebook reset cell - confirm before running the file as a whole.
from elasticsearch import Elasticsearch
es = Elasticsearch()
es.indices.delete(index='resume')


# In[ ]:


# Scratch check: fetch the search-results page with an empty start offset,
# using a plain request (no login session, no custom headers), and collect
# the resume links the same way ingester() does.
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.indeed.com/resumes?q=data+scientist&l=GTA%2C+ON&co=CA&cb=jt&start=')
soup = BeautifulSoup(r.text,'html.parser')
content = soup.find_all('a', class_= 'app_link')


# In[ ]:


# Bare expression: displays the number of links found when run as a notebook
# cell; it has no visible effect when the file is run as a script.
len(content)


# In[ ]: