# Untitled

# Scraping

# Rows collected across all scraped profiles. Each row is
# [name, company/school, position/degree, start_date, end_date, url].
linkedin_takram_0_50 = []

# Only the first 50 URLs of `aaa` are visited (original note: takram_url_folder).
# The colon used to sit inside the trailing comment, which made the loop
# header a syntax error.
for linkedin_url in aaa[0:50]:
    # Open the profile URL in the browser driver.
    driver.get(linkedin_url)
    # Pause 10 seconds so the page can finish loading before it is read.
    sleep(10)
    # Parse the current page source so fields can be extracted with XPath.
    sel = Selector(text=driver.page_source)

#open_show_more_if_exists
    try:
        # The lookup raises when the experience section has no "show more"
        # button, so nothing below runs for profiles without one.
        show_more_button = driver.find_element_by_xpath('//*[starts-with(@class, "pv-experience-section__see-more")]/button')
        print("open_show_more")
        show_more_button.click()
        sleep(3)
        # Re-parse the page: the snapshot taken before the click does not
        # contain the entries the button has just revealed.
        sel = Selector(text=driver.page_source)
    except Exception:
        # Best effort: a missing or unclickable button just means the
        # profile is scraped as currently rendered. `Exception` (instead of
        # a bare except) lets Ctrl-C still stop the run.
        print("Didn't_open_show_more")

    #if_Nfor1_exists
    # Build the rows for one profile and add them to linkedin_takram_0_50.
    #
    # A profile that contains a "pv-entity__company-details" element is
    # treated as the "N for 1" layout (presumably several roles grouped under
    # one company -- confirm against the page markup); its company, first
    # role and first date range are scraped separately and placed in front of
    # the regular entries.
    #
    # Only the element lookup decides the layout. Previously the whole
    # N-for-1 processing sat inside a bare `except`, so any bug in it silently
    # fell through to the regular layout.
    try:
        driver.find_element_by_xpath('//*[starts-with(@class, "pv-entity__company-details")]')
    except Exception:
        has_nfor1 = False
        print("Not_nfor1_exists")
    else:
        has_nfor1 = True
        print("nfor1_exists")

    placeholder = "NaN – NaN"
    date_separator = " – "

    # NOTE(review): everything below assumes validate_field returns list
    # inputs as lists and a falsy value for a missing scalar -- confirm
    # against its definition.

    #work
    if has_nfor1:
        nfor1company_history = validate_field(sel.xpath('//*[starts-with(@class, "pv-entity__company-details")]/div[2]/h3/span[2]/text()').extract())
        # extract_first() yields a single value (or None), not a list.
        nfor1position_history = validate_field(sel.xpath('//*[starts-with(@class, "pv-entity__position-group-role-item")]/div/div/div/div[1]/h3/span[2]/text()').extract_first())
        nfor1work_date1 = validate_field(sel.xpath('//*[starts-with(@class, "pv-entity__position-group-role-item")]/div/div/div/div/div[1]/h4[1]/span[2]/text()').extract_first())
    else:
        nfor1company_history = []
        nfor1position_history = None
        nfor1work_date1 = None
    company_history = validate_field(sel.xpath('//*[starts-with(@class, "pv-entity__secondary-title")]/text()').extract())
    position_history = validate_field(sel.xpath('//*[starts-with(@class, "t-16 t-black t-bold")]/text()').extract())
    work_date1 = validate_field(sel.xpath('//*[starts-with(@class, "pv-entity__date-range t-14 t-black--light t-normal")]/span[2]//text()').extract())

    #education
    # The education details hold both a "degree" and a "field of study".
    # Taking both would double the number of details and break the count
    # match with career_history, so only the degree is taken.
    education_history = validate_field(sel.xpath('//*[starts-with(@class, "pv-entity__school-name t-16 t-black t-bold")]/text()').extract())
    degree_history = validate_field(sel.xpath('//*[starts-with(@class, "pv-entity__secondary-title pv-entity__degree-name pv-entity__secondary-title t-14 t-black t-normal")]/span[2]/text()').extract())
    s_education_date = validate_field(sel.xpath('//*[starts-with(@class, "pv-entity__dates t-14 t-black--light t-normal")]/span/time[1]/text()').extract())
    e_education_date = validate_field(sel.xpath('//*[starts-with(@class, "pv-entity__dates t-14 t-black--light t-normal")]/span/time[2]/text()').extract())

    #name
    name = validate_field(sel.xpath('//*[starts-with(@class, "pv-top-card-section__name")]/text()').extract_first())

    #data_arrangement

    # Drop layout-only text nodes first and add the placeholder afterwards,
    # so a column made up solely of whitespace still ends with one entry.
    blank_nodes = ("\n        ", "\n      ", "\n    ")
    company_history = [c for c in company_history if c not in blank_nodes]
    if not company_history:
        company_history = [placeholder]

    position_history = [
        p for p in position_history
        if p not in blank_nodes and p != "Messaging Settings"
    ]
    if not position_history:
        position_history = [placeholder]

    # N-for-1 columns. The scalars are wrapped in lists here; the original
    # called .append() on the scalar itself, which raised whenever the value
    # was missing.
    if has_nfor1 and not nfor1company_history:
        nfor1company_history = [placeholder]
    nfor1position_history_list = [nfor1position_history] if nfor1position_history else []
    nfor1work_date2 = [nfor1work_date1] if nfor1work_date1 else []
    # Rows are matched by index further down, so the N-for-1 position and
    # date columns must be as long as the N-for-1 company column; otherwise
    # every later row is shifted.
    for nfor1_column in (nfor1position_history_list, nfor1work_date2):
        nfor1_column.extend([placeholder] * (len(nfor1company_history) - len(nfor1_column)))

    #work_date (s_new_work_date, e_new_work_date)
    work_date1.extend([placeholder] * (len(company_history) - len(work_date1)))

    # A date without an end part (e.g. "Jan 2018" or "2018") gets "NaN" as
    # its end so that every entry splits into exactly [start, end].
    work_date2 = [
        w if date_separator in w else w + " – NaN"
        for w in work_date1
    ]
    nfor1_new_work_date = nfor1work_date2 + work_date2
    new_work_date = [
        (w if date_separator in w else w + " – NaN").split(date_separator, 1)
        for w in nfor1_new_work_date
    ]

    # new_work_date is a nested list and str.replace works on strings only,
    # so both levels are iterated.
    month_numbers = (
        ("Jan", "01 /"), ("Feb", "02 /"), ("Mar", "03 /"), ("Apr", "04 /"),
        ("May", "05 /"), ("Jun", "06 /"), ("Jul", "07 /"), ("Aug", "08 /"),
        ("Sep", "09 /"), ("Oct", "10 /"), ("Nov", "11 /"), ("Dec", "12 /"),
    )
    new_work_date1 = []
    for start_end in new_work_date:
        for part in start_end:
            for month_name, month_number in month_numbers:
                part = part.replace(month_name, month_number)
            new_work_date1.append(part)

    # Every entry contributed exactly two parts, so even positions are
    # starts and odd positions are ends.
    s_new_work_date = new_work_date1[0::2]
    e_new_work_date = new_work_date1[1::2]

    #education_history, degree_history
    degree_history.extend([placeholder] * (len(education_history) - len(degree_history)))

    #extend(company_history, education_history)
    career_history = nfor1company_history + company_history + education_history

    #extend(position_history, education_degree)
    detail_history = nfor1position_history_list + position_history + degree_history

    #extend(s_date, e_date)
    s_date = s_new_work_date + s_education_date
    e_date = e_new_work_date + e_education_date

    #name
    if name:
        name = name.strip()

    # One name/url per row; always at least one entry, as before.
    name_list = [name] * max(1, len(s_new_work_date) + len(s_education_date))
    url_list = [linkedin_url] * len(name_list)

    # Debug output.
    if has_nfor1:
        print(nfor1work_date1)
        print(nfor1work_date2)
        print(work_date2)
        print(nfor1_new_work_date)
        print(nfor1position_history)
        print(position_history)
        print(degree_history)

    for header, value in (
        ("company_history = {}".format(len(company_history)), company_history),
        ("position_history = {}".format(len(position_history)), position_history),
        ("work_date", work_date1),
        ("work_date2", work_date2),
        ("new_work_date", new_work_date),
        ("new_work_date1", new_work_date1),
        ("education_history = {}".format(len(education_history)), education_history),
        ("s_new_work_date", s_new_work_date),
        ("e_new_work_date", e_new_work_date),
        ("degree_history", degree_history),
        ("@1@ career_history = {}".format(len(career_history)), career_history),
        ("@2@ detail_history = {}".format(len(detail_history)), detail_history),
        ("@3@ s_date = {}".format(len(s_date)), s_date),
        ("@4@ e_date = {}".format(len(e_date)), e_date),
        ("@5@ name_list = {}".format(len(name_list)), name_list),
        ("@6@ url_list = {}".format(len(url_list)), linkedin_url),
    ):
        print(header)
        print(value)

    #data_append
    # zip() stops at the shortest column, which is where the index-based
    # loop used to hit IndexError; the rows before that point are kept.
    rows = [
        list(row)
        for row in zip(name_list, career_history, detail_history, s_date, e_date, url_list)
    ]
    linkedin_takram_0_50.extend(rows)
    if len(rows) < len(name_list):
        print("Index Error _ Nfo1" if has_nfor1 else "Index Error_not_Nfor1")

#with open("linkedin_data01.csv", "w", newline="", encoding="utf-8") as f:
    #writer = csv.writer(f, lineterminator ="\n")
    #writer.writerow(['Name', 'company/school', 'position/degree', 'start_date', 'end_date', 'url'])
    #writer.writerows(linkedin_data01)