Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# coding: utf-8
# Configuration for the Indeed resume scraper (exported from a Jupyter notebook).

# In[1]:
# Endpoint used to authenticate the session before fetching resume pages.
login_url = 'https://secure.indeed.com/account/login'

# In[2]:
# Login form payload posted to `login_url`.
# NOTE(review): credentials are hard-coded placeholders; move them to
# environment variables or a config file before real use.
data = {
    'action': 'login',
    '__email': 'anonymous@gmail.com',
    '__password': 'password',
    'remember': '1',
    'hl': 'en',
    'continue': '/account/view?hl=en',
}

# In[3]:
# Request headers sent with every resume fetch.
# Fixed: the key was 'User_Agent' (underscore); HTTP defines the header as
# 'User-Agent', so the underscore variant was ignored by the server and the
# browser spoofing never took effect.
header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
# In[4]:
def listhandler(lst, size=5):
    """Pad or truncate *lst* to exactly *size* entries.

    Missing positions are filled with '' so that downstream code can index
    the first *size* slots unconditionally without IndexError.

    Args:
        lst: sequence of collected values (may be shorter or longer than size).
        size: number of slots to return (default 5, matching the ES schema).

    Returns:
        A new list of length *size*; the input is not mutated.
    """
    # Replaces the original index-by-index loop with a bare `except:` that
    # silently swallowed *any* error, not just IndexError.
    return (list(lst) + [''] * size)[:size]
# In[5]:
def ingester(string):
    """Scrape one Indeed resume search-results page and index every resume.

    *string* is the URL of a search-results page. Each 'app_link' anchor on
    it points to a resume; for every resume the applicant name, up to five
    work-experience sections, up to five education sections, skills, and
    certifications are flattened into one document and indexed into the
    Elasticsearch index 'resume'.

    Relies on the module-level `login_url`, `data`, `header`, and
    `listhandler` definitions.

    Returns:
        True once the whole results page has been processed.

    Fixes vs. the original notebook cell:
      * education-date fallback parsed `period[0]` (a leftover from the
        work-experience loop) instead of `edu_period[0]`;
      * the ES document had a duplicate "title_1" key where "title_4"
        was intended, silently dropping one title and clobbering another;
      * `location_c` was left unassigned (NameError / stale value) when the
        company text did not split into exactly 1 or 2 parts.
    """
    from bs4 import BeautifulSoup
    import requests
    from datetime import datetime
    import re
    import time
    from elasticsearch import Elasticsearch

    def _years_span(period):
        """Best-effort whole-year span for a date range split on ' to '.

        Mirrors the notebook's branch logic for the observed formats:
        'Month YYYY to Month YYYY', 'Month YYYY to YYYY',
        'YYYY to Month YYYY', 'YYYY to YYYY', with 'Present' meaning today.
        Anything that is not a two-part range counts as 1 year; negative or
        sub-year spans are clamped to 0.
        """
        years = 0
        if len(period) == 2:
            if period[1] == 'Present':
                period[1] = datetime.today().strftime('%B %Y')
            if len(period[0]) > 4 and len(period[1]) > 4:
                # 'Month YYYY to Month YYYY' — try full month name, then abbreviated.
                try:
                    delta = (datetime.strptime(period[1], '%B %Y').date()
                             - datetime.strptime(period[0], '%B %Y').date())
                except ValueError:
                    delta = (datetime.strptime(period[1], '%b %Y').date()
                             - datetime.strptime(period[0], '%b %Y').date())
                years = int(abs(round(delta.days / 365, 0)))
            elif len(period[0]) > 4:
                # 'Month YYYY to YYYY'
                years = int(period[1]) - int(period[0].split()[1])
            elif len(period[1]) > 4:
                # 'YYYY to Month YYYY'
                years = int(period[1].split()[1]) - int(period[0])
            else:
                # 'YYYY to YYYY'
                years = int(period[1]) - int(period[0])
        else:
            years = 1
        if years < 1:
            years = 0
        return years

    es = Elasticsearch()
    # Authenticated session: log in once, reuse the cookies for every fetch.
    s = requests.session()
    s.post(login_url, data=data)

    # Fetch the results page and collect the absolute resume URLs.
    rq = s.get(string)
    soup = BeautifulSoup(rq.text, 'html.parser')
    content = soup.find_all('a', class_='app_link')
    url = ['https://www.indeed.com' + c.attrs['href'] for c in content]

    track = 0
    for resume_url in url:
        track += 1
        r = s.get(resume_url, headers=header)
        soup = BeautifulSoup(r.text, 'html.parser')

        # Applicant name: the resume page's single <h1>, title-cased.
        name = soup.find('h1').text.title()
        print(name)

        # ---- work experience (first five sections at most) ----
        experiences = soup.find_all(
            'div', class_=re.compile(r'work-experience-section( \w)*'))[:5]
        company_list = []
        YOE_list = []
        experience_list = []
        jobtitle_list = []
        location_c_list = []
        for experience in experiences:
            title_tag = experience.find('p', class_='work_title title')
            jobtitle = title_tag.text if title_tag is not None else 'No Job Title'

            # Company text is usually 'Company - Location'; fall back to
            # placeholders so every field is always defined (the original
            # left location_c unassigned on the odd-split path).
            company = 'No Company'
            location_c = 'No Company Location'
            company_tag = experience.find('div', class_='work_company')
            if company_tag is not None:
                parts = company_tag.text.split(' - ')
                if len(parts) == 2:
                    company, location_c = parts
                elif len(parts) == 1:
                    company = company_tag.text

            dates_tag = experience.find('p', class_='work_dates')
            # None marks "no dates on the resume", distinct from a 0-year span.
            YOE = _years_span(dates_tag.text.split(' to ')) if dates_tag is not None else None

            desc_tag = experience.find('p', class_='work_description')
            desc = desc_tag.text if desc_tag is not None else 'No job description'

            company_list.append(company)
            YOE_list.append(YOE)
            experience_list.append(desc)
            jobtitle_list.append(jobtitle)
            location_c_list.append(location_c)

        # ---- education (first five sections at most) ----
        educations = soup.find_all(
            'div', class_=re.compile(r'education-section( \w)*'))[:5]
        school_list = []
        YOEdu_list = []
        degree_list = []
        location_list = []
        for e in educations:
            degree_tag = e.find('p', class_='edu_title')
            degree = degree_tag.text if degree_tag is not None else 'No Education Information'

            school_tag = e.find('div', class_='edu_school')
            edu_school = (school_tag.text.split(' - ')[0]
                          if school_tag is not None else 'No School Information')

            loc_tag = e.find('div', class_='inline-block')
            edu_location = loc_tag.text if loc_tag is not None else 'No Education Location Info'

            dates_tag = e.find('p', class_='edu_dates')
            if dates_tag is not None:
                edu_period = dates_tag.text.split(' to ')
                print(edu_period)
                # Fixed: the original's '%b %Y' fallback mistakenly parsed
                # period[0] from the work-experience loop here.
                YOEdu = _years_span(edu_period)
            else:
                YOEdu = None

            school_list.append(edu_school)
            YOEdu_list.append(YOEdu)
            degree_list.append(degree)
            location_list.append(edu_location)

        # ---- skills and certifications: concatenated free text ----
        skills = ''.join(sk.text for sk in soup.find_all('span', class_='skill-text'))
        certs = ''.join(
            c.text for c in soup.find_all(
                'div', class_=re.compile(r'certification-section( \w)*')))

        # Pad every list to exactly five slots for the fixed ES schema.
        company_list = listhandler(company_list)
        YOE_list = listhandler(YOE_list)
        experience_list = listhandler(experience_list)
        jobtitle_list = listhandler(jobtitle_list)
        location_c_list = listhandler(location_c_list)
        school_list = listhandler(school_list)
        YOEdu_list = listhandler(YOEdu_list)
        degree_list = listhandler(degree_list)
        location_list = listhandler(location_list)

        es.index(index='resume',
                 doc_type='test-type',
                 body={'id': track,
                       'applicant': name,
                       "experience": {
                           "company_1": company_list[0],
                           "title_1": jobtitle_list[0],
                           "years of experience_1": YOE_list[0],
                           "location_1": location_c_list[0],
                           "description_1": experience_list[0],
                           "company_2": company_list[1],
                           "title_2": jobtitle_list[1],
                           "years of experience_2": YOE_list[1],
                           "location_2": location_c_list[1],
                           "description_2": experience_list[1],
                           "company_3": company_list[2],
                           "title_3": jobtitle_list[2],
                           "years of experience_3": YOE_list[2],
                           "location_3": location_c_list[2],
                           "description_3": experience_list[2],
                           "company_4": company_list[3],
                           # Fixed: was a duplicate "title_1" key.
                           "title_4": jobtitle_list[3],
                           "years of experience_4": YOE_list[3],
                           "location_4": location_c_list[3],
                           "description_4": experience_list[3],
                           "company_5": company_list[4],
                           "title_5": jobtitle_list[4],
                           "years of experience_5": YOE_list[4],
                           "location_5": location_c_list[4],
                           "description_5": experience_list[4]
                       },
                       "education": {
                           "school_1": school_list[0],
                           "degree_1": degree_list[0],
                           "location_1": location_list[0],
                           "yearsOfeducation_1": YOEdu_list[0],
                           "school_2": school_list[1],
                           "degree_2": degree_list[1],
                           "location_2": location_list[1],
                           "yearsOfeducation_2": YOEdu_list[1],
                           "school_3": school_list[2],
                           "degree_3": degree_list[2],
                           "location_3": location_list[2],
                           "yearsOfeducation_3": YOEdu_list[2],
                           "school_4": school_list[3],
                           "degree_4": degree_list[3],
                           "location_4": location_list[3],
                           "yearsOfeducation_4": YOEdu_list[3],
                           "school_5": school_list[4],
                           "degree_5": degree_list[4],
                           "location_5": location_list[4],
                           "yearsOfeducation_5": YOEdu_list[4]
                       },
                       "skills": skills,
                       "certifications": certs})
        # Throttle between resume fetches to avoid hammering the site.
        time.sleep(2)
    return True
    # TODO(review): Awards / Additional Information sections are still unscraped.
# In[6]:
# Drive the crawl: Indeed paginates resume search results 50 per page,
# so fetch the first six pages of the data-scientist query.
url_base = 'https://www.indeed.com/resumes?q=data+scientist&l=GTA%2C+ON&co=CA&cb=jt&start='
for page in range(6):
    print('working on page: %d' % (page + 1))
    ingester(url_base + str(page * 50))
# In[ ]:
# Maintenance cell: wipe the entire 'resume' index so a fresh crawl
# starts from a clean slate.
from elasticsearch import Elasticsearch

es_client = Elasticsearch()
es_client.indices.delete(index='resume')
# In[ ]:
# Debug cell: fetch the first results page *without* logging in and
# count how many resume links ('app_link' anchors) are visible.
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.indeed.com/resumes?q=data+scientist&l=GTA%2C+ON&co=CA&cb=jt&start=')
page_soup = BeautifulSoup(response.text, 'html.parser')
content = page_soup.find_all('a', class_='app_link')

# In[ ]:
len(content)

# In[ ]:
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement