Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Scraping
#
# Visits each LinkedIn profile URL, expands the experience section when a
# "show more" button is present, and appends one row per work/education entry
# to ``linkedin_takram_0_50``:
#     [name, company/school, position/degree, start_date, end_date, url]
#
# ``aaa``, ``driver``, ``Selector``, ``sleep`` and ``validate_field`` are not
# defined in this section; they are expected to exist earlier in the file.

PAGE_LOAD_WAIT_SECONDS = 10
SHOW_MORE_WAIT_SECONDS = 3
PLACEHOLDER = "NaN – NaN"
DATE_SEPARATOR = " – "

SHOW_MORE_XPATH = '//*[starts-with(@class, "pv-experience-section__see-more")]/button'
# "nfor1" presumably means "N roles for 1 company" (grouped positions) -- TODO confirm.
NFOR1_MARKER_XPATH = '//*[starts-with(@class, "pv-entity__company-details")]'
NFOR1_COMPANY_XPATH = '//*[starts-with(@class, "pv-entity__company-details")]/div[2]/h3/span[2]/text()'
NFOR1_POSITION_XPATH = '//*[starts-with(@class, "pv-entity__position-group-role-item")]/div/div/div/div[1]/h3/span[2]/text()'
NFOR1_DATE_XPATH = '//*[starts-with(@class, "pv-entity__position-group-role-item")]/div/div/div/div/div[1]/h4[1]/span[2]/text()'
COMPANY_XPATH = '//*[starts-with(@class, "pv-entity__secondary-title")]/text()'
POSITION_XPATH = '//*[starts-with(@class, "t-16 t-black t-bold")]/text()'
WORK_DATE_XPATH = '//*[starts-with(@class, "pv-entity__date-range t-14 t-black--light t-normal")]/span[2]//text()'
# The education details contain both "degree" and "field of study"; taking
# both would double the number of details and no longer match the number of
# schools, so only the degree is collected.
SCHOOL_XPATH = '//*[starts-with(@class, "pv-entity__school-name t-16 t-black t-bold")]/text()'
DEGREE_XPATH = '//*[starts-with(@class, "pv-entity__secondary-title pv-entity__degree-name pv-entity__secondary-title t-14 t-black t-normal")]/span[2]/text()'
EDUCATION_START_XPATH = '//*[starts-with(@class, "pv-entity__dates t-14 t-black--light t-normal")]/span/time[1]/text()'
EDUCATION_END_XPATH = '//*[starts-with(@class, "pv-entity__dates t-14 t-black--light t-normal")]/span/time[2]/text()'
NAME_XPATH = '//*[starts-with(@class, "pv-top-card-section__name")]/text()'

# Applied in this order, exactly like the original chained ``replace`` calls.
_MONTH_NUMBERS = (
    ("Jan", "01 /"), ("Feb", "02 /"), ("Mar", "03 /"), ("Apr", "04 /"),
    ("May", "05 /"), ("Jun", "06 /"), ("Jul", "07 /"), ("Aug", "08 /"),
    ("Sep", "09 /"), ("Oct", "10 /"), ("Nov", "11 /"), ("Dec", "12 /"),
)


def _as_list(value):
    """Return *value* as a new list.

    Lists are copied, falsy values (``None``, ``""``, ``[]``) become ``[]`` and
    any other value is wrapped in a one-element list. This keeps the list
    operations below safe whatever ``validate_field`` returns for a missing
    field (its implementation is not visible here).
    """
    if isinstance(value, list):
        return list(value)
    if not value:
        return []
    return [value]


def _extract_all(sel, xpath):
    """Return every text match of *xpath*, validated, as a list."""
    return _as_list(validate_field(sel.xpath(xpath).extract()))


def _extract_first(sel, xpath):
    """Return the first text match of *xpath*, validated, as a 0/1-item list."""
    return _as_list(validate_field(sel.xpath(xpath).extract_first()))


def _drop_blank(entries, unwanted=()):
    """Return *entries* without whitespace-only items or items in *unwanted*."""
    return [entry for entry in entries if entry.strip() and entry not in unwanted]


def _or_placeholder(entries):
    """Return *entries*, or a single placeholder entry when it is empty."""
    return entries if entries else [PLACEHOLDER]


def _pad(entries, length, filler=PLACEHOLDER):
    """Return *entries* extended with *filler* up to at least *length* items."""
    return entries + [filler] * max(0, length - len(entries))


def _numeric_months(text):
    """Replace English month abbreviations in *text* with ``"MM /"``."""
    for abbreviation, number in _MONTH_NUMBERS:
        text = text.replace(abbreviation, number)
    return text


def _split_date_range(date_range):
    """Split ``"Jan 2018 – Present"`` into ``("01 / 2018", "Present")``.

    A value without the separator (a single date) gets ``"NaN"`` as its end,
    so every entry always yields exactly one start and one end.
    """
    start, separator, end = date_range.partition(DATE_SEPARATOR)
    if not separator:
        end = "NaN"
    return _numeric_months(start), _numeric_months(end)


def _profile_rows(sel, linkedin_url, has_nfor1):
    """Build the output rows for one parsed profile page.

    sel          -- selector over the profile's page source
    linkedin_url -- URL stored in the last column of every row
    has_nfor1    -- whether the grouped-position markup was found on the page

    Returns a list of ``[name, career, detail, start, end, url]`` lists. The
    number of rows is limited by the shortest of the four data columns.
    """
    # work
    companies = _or_placeholder(_drop_blank(_extract_all(sel, COMPANY_XPATH)))
    positions = _or_placeholder(
        _drop_blank(_extract_all(sel, POSITION_XPATH), unwanted=("Messaging Settings",))
    )
    # Every company needs a date range, even if the page showed fewer.
    work_dates = _pad(_extract_all(sel, WORK_DATE_XPATH), len(companies))

    # education
    schools = _extract_all(sel, SCHOOL_XPATH)
    degrees = _pad(_extract_all(sel, DEGREE_XPATH), len(schools))
    education_starts = _extract_all(sel, EDUCATION_START_XPATH)
    education_ends = _extract_all(sel, EDUCATION_END_XPATH)

    # grouped positions: all company names, but only the first role and date
    if has_nfor1:
        nfor1_companies = _or_placeholder(_extract_all(sel, NFOR1_COMPANY_XPATH))
        nfor1_positions = _or_placeholder(_extract_first(sel, NFOR1_POSITION_XPATH))
        nfor1_dates = _pad(_extract_first(sel, NFOR1_DATE_XPATH), len(nfor1_companies))
    else:
        nfor1_companies, nfor1_positions, nfor1_dates = [], [], []

    # name
    name = validate_field(sel.xpath(NAME_XPATH).extract_first())
    if name:
        name = name.strip()

    date_pairs = [_split_date_range(entry) for entry in nfor1_dates + work_dates]
    work_starts = [start for start, _ in date_pairs]
    work_ends = [end for _, end in date_pairs]

    career_history = nfor1_companies + companies + schools
    detail_history = nfor1_positions + positions + degrees
    s_date = work_starts + education_starts
    e_date = work_ends + education_ends
    expected_rows = max(1, len(s_date))

    print("company_history = {}".format(len(companies)))
    print(companies)
    print("position_history = {}".format(len(positions)))
    print(positions)
    print("work_date")
    print(work_dates)
    print("education_history = {}".format(len(schools)))
    print(schools)
    print("s_new_work_date")
    print(work_starts)
    print("e_new_work_date")
    print(work_ends)
    print("degree_history")
    print(degrees)
    print("@1@ career_history = {}".format(len(career_history)))
    print(career_history)
    print("@2@ detail_history = {}".format(len(detail_history)))
    print(detail_history)
    print("@3@ s_date = {}".format(len(s_date)))
    print(s_date)
    print("@4@ e_date = {}".format(len(e_date)))
    print(e_date)
    print("@5@ name_list = {}".format(expected_rows))
    print([name] * expected_rows)
    print("@6@ url_list = {}".format(expected_rows))
    print(linkedin_url)

    # zip stops at the shortest column, which keeps exactly the rows the
    # original index loop managed to append before hitting an IndexError.
    rows = [
        [name, career, detail, start, end, linkedin_url]
        for career, detail, start, end in zip(career_history, detail_history, s_date, e_date)
    ]
    if len(rows) < expected_rows:
        print("Index Error _ Nfo1" if has_nfor1 else "Index Error_not_Nfor1")
    return rows


linkedin_takram_0_50 = []
for linkedin_url in aaa[0:50]:  # takram_url_folder
    # get the profile URL
    driver.get(linkedin_url)
    # pause while the page loads
    sleep(PAGE_LOAD_WAIT_SECONDS)

    # open_show_more_if_exists
    # find_elements_* returns an empty list instead of raising when absent.
    show_more_buttons = driver.find_elements_by_xpath(SHOW_MORE_XPATH)
    if show_more_buttons:
        print("open_show_more")
        try:
            show_more_buttons[0].click()
            sleep(SHOW_MORE_WAIT_SECONDS)
        except Exception as exc:
            # Best effort: a failed click must not abort the whole scrape.
            print("Didn't_open_show_more", repr(exc))
    else:
        print("Didn't_open_show_more")

    # Parse the page only after the click so expanded entries are included.
    sel = Selector(text=driver.page_source)

    # if_Nfor1_exists
    has_nfor1 = bool(driver.find_elements_by_xpath(NFOR1_MARKER_XPATH))
    print("nfor1_exists" if has_nfor1 else "Not_nfor1_exists")

    # data_append
    linkedin_takram_0_50.extend(_profile_rows(sel, linkedin_url, has_nfor1))

# CSV export, left disabled as in the original.
# NOTE(review): ``linkedin_data01`` is not defined in this section; presumably
# the rows collected in ``linkedin_takram_0_50`` are meant -- confirm.
#with open("linkedin_data01.csv", "w", newline="", encoding="utf-8") as f:
    #writer = csv.writer(f, lineterminator ="\n")
    #writer.writerow(['Name', 'company/school', 'position/degree', 'start_date', 'end_date', 'url'])
    #writer.writerows(linkedin_data01)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement