Advertisement
Guest User

Untitled

a guest
Mar 21st, 2019
320
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 15.05 KB | None | 0 0
  1. # Scraping
  2.  
  3. linkedin_takram_0_50 = []
  4.  
  5. for linkedin_url in aaa[0:50]#takram_url_folder:
  6. # get the profile URL
  7. driver.get(linkedin_url)
  8. #add a 20 second pause loading each URL
  9. sleep(10)
  10. # assigning the source code for the webpage to variable sel
  11. sel = Selector(text=driver.page_source)
  12.  
  13. #open_show_more_if_exists
  14. try:
  15. sel.xpath('//*[starts-with(@class, "pv-experience-section__see-more")]/button')
  16. print("open_show_more")
  17. show_more_button = driver.find_element_by_xpath('//*[starts-with(@class, "pv-experience-section__see-more")]/button')
  18. show_more_button.click()
  19. sleep(3)
  20. except:
  21. print("Didn't_open_show_more")
  22.  
  23. #if_Nfor1_exists
  24. try:
  25. driver.find_element_by_xpath('//*[starts-with(@class, "pv-entity__company-details")]')
  26. print("nfor1_exists")
  27. #work
  28. nfor1company_history = sel.xpath('//*[starts-with(@class, "pv-entity__company-details")]/div[2]/h3/span[2]/text()').extract() ##
  29. nfor1position_history = sel.xpath('//*[starts-with(@class, "pv-entity__position-group-role-item")]/div/div/div/div[1]/h3/span[2]/text()').extract_first() ###
  30. nfor1work_date1 = sel.xpath ('//*[starts-with(@class, "pv-entity__position-group-role-item")]/div/div/div/div/div[1]/h4[1]/span[2]/text()').extract_first()
  31. company_history = sel.xpath('//*[starts-with(@class, "pv-entity__secondary-title")]/text()').extract()
  32. position_history = sel.xpath('//*[starts-with(@class, "t-16 t-black t-bold")]/text()').extract()
  33. work_date1 = sel.xpath('//*[starts-with(@class, "pv-entity__date-range t-14 t-black--light t-normal")]/span[2]//text()').extract()
  34.  
  35. #education
  36. #education_detail には "degree"と"field of study"があり、両方とるとdetail数が2倍になり、career_historyと数が合わなくなるため、degreeのみとる。
  37. education_history = sel.xpath('//*[starts-with(@class, "pv-entity__school-name t-16 t-black t-bold")]/text()').extract()
  38. degree_history = sel.xpath('//*[starts-with(@class, "pv-entity__secondary-title pv-entity__degree-name pv-entity__secondary-title t-14 t-black t-normal")]/span[2]/text()').extract()
  39. s_education_date = sel.xpath('//*[starts-with(@class, "pv-entity__dates t-14 t-black--light t-normal")]/span/time[1]/text()').extract()
  40. e_education_date = sel.xpath('//*[starts-with(@class, "pv-entity__dates t-14 t-black--light t-normal")]/span/time[2]/text()').extract()
  41.  
  42. #name
  43. name = sel.xpath('//*[starts-with(@class, "pv-top-card-section__name")]/text()').extract_first()
  44.  
  45. # validating if the fields exist on the profile
  46. name = validate_field(name)
  47. nfor1company_history = validate_field(nfor1company_history) ###
  48. company_history = validate_field(company_history)
  49. nfor1position_history = validate_field(nfor1position_history) ###
  50. position_history = validate_field(position_history)
  51. nfor1work_date1 = validate_field(nfor1work_date1) ###
  52. work_date1 = validate_field(work_date1)
  53. education_history = validate_field(education_history)
  54. degree_history = validate_field(degree_history)
  55. s_education_date = validate_field(s_education_date)
  56. e_education_date = validate_field(e_education_date)
  57.  
  58. #data_arrangement
  59.  
  60. #company_history
  61.  
  62. if len(nfor1company_history) ==0:
  63. nfor1company_history.append("NaN – NaN") ###
  64.  
  65. if len(company_history)==0:
  66. company_history.append("NaN – NaN")
  67.  
  68. while "\n " in company_history:
  69. company_history.remove("\n ")
  70.  
  71. while "\n " in company_history:
  72. company_history.remove("\n ")
  73.  
  74. while "\n " in company_history:
  75. company_history.remove("\n ")
  76.  
  77. #position_history
  78.  
  79. if len(nfor1position_history) == 0:
  80. nfor1position_history.append("NaN – NaN")
  81.  
  82. if len(position_history)==0:
  83. position_history.append("NaN – NaN")
  84.  
  85. while "\n " in position_history:
  86. position_history.remove("\n ")
  87.  
  88. while "\n " in position_history:
  89. position_history.remove("\n ")
  90.  
  91. while "\n " in position_history:
  92. position_history.remove("\n ")
  93.  
  94. while "Messaging Settings" in position_history:
  95. position_history.remove("Messaging Settings")
  96.  
  97. #work_date (s_new_work_data, e_new_work_date)
  98.  
  99. while len(nfor1company_history) > len(nfor1work_date1):
  100. nfor1work_date1.append("NaN – NaN")
  101.  
  102. while len(company_history) > len(work_date1):
  103. work_date1.append("NaN – NaN")
  104.  
  105. nfor1work_date2 = []
  106. nfor1work_date2.append(nfor1work_date1)
  107.  
  108. work_date2 = []
  109. for w in work_date1:
  110. if len(w) == 8:
  111. w = w + " – NaN"
  112. if len(w) == 4:
  113. w = w + " – NaN"
  114. work_date2.append(w)
  115.  
  116.  
  117. print(nfor1work_date1)
  118. print(nfor1work_date2)
  119. print(work_date2)
  120. nfor1_new_work_date = nfor1work_date2 + work_date2
  121. print(nfor1_new_work_date)
  122.  
  123. new_work_date = []
  124. for w in nfor1_new_work_date:
  125. new_work_date.append(w.split(" – "))
  126.  
  127. #new_work_dateは二重リスト。replaceはリストには使えないため、forを二回行う。
  128. new_work_date1 = []
  129. for d in new_work_date:
  130. for e in d:
  131. f = e.replace("Jan", "01 /").replace("Feb", "02 /").replace("Mar", "03 /").replace("Apr", "04 /").replace("May", "05 /").replace("Jun", "06 /").replace("Jul", "07 /").replace("Aug", "08 /").replace("Sep", "09 /").replace("Oct", "10 /").replace("Nov", "11 /").replace("Dec", "12 /")
  132. new_work_date1.append(f)
  133.  
  134. s_new_work_date = new_work_date1[0::2]
  135. e_new_work_date = new_work_date1[1::2]
  136.  
  137. #for i in range(len(new_work_date)):
  138. #s_new_work_date.append(new_work_date1[0::2])
  139. #e_new_work_date.append(new_work_date1[1::2])
  140.  
  141. #education_history, degree_history
  142. while len(degree_history) < len(education_history):
  143. degree_history.append("NaN – NaN")
  144.  
  145. #extend(company_history, education_history)
  146.  
  147. career_history = nfor1company_history + company_history + education_history
  148.  
  149. #extend(position_history, education_degree)
  150. print(nfor1position_history)
  151. nfor1position_history_list = []
  152. nfor1position_history_list.append(nfor1position_history)
  153. print(position_history)
  154. print(degree_history)
  155. detail_history = nfor1position_history_list + position_history + degree_history
  156.  
  157. #extend(s_date, e_date)
  158.  
  159. s_date = s_new_work_date + s_education_date
  160. e_date = e_new_work_date + e_education_date
  161.  
  162. #name
  163.  
  164. if name:
  165. name = name.strip()
  166.  
  167. name_list = []
  168. name_list.append(name)
  169.  
  170. while len(name_list) < (len(s_new_work_date) + len(s_education_date)):
  171. name_list.append(name)
  172.  
  173. url_list =[]
  174. while len(url_list) < len(name_list):
  175. url_list.append(linkedin_url)
  176.  
  177. print("company_history = {}".format(len(company_history)))
  178. print(company_history)
  179.  
  180. print("position_history = {}".format(len(position_history)))
  181. print(position_history)
  182.  
  183. print("work_date")
  184. print(work_date1)
  185.  
  186. print("work_date2")
  187. print(work_date2)
  188.  
  189. print("new_work_date")
  190. print(new_work_date)
  191.  
  192. print("new_work_date1")
  193. print(new_work_date1)
  194.  
  195. print("education_history = {}".format(len(education_history)))
  196. print(education_history)
  197.  
  198. print("s_new_work_date")
  199. print(s_new_work_date)
  200.  
  201. print("e_new_work_date")
  202. print(e_new_work_date)
  203.  
  204. print("degree_history")
  205. print(degree_history)
  206.  
  207. print("@1@ career_history = {}".format(len(career_history)))
  208. print(career_history)
  209.  
  210. print("@2@ detail_history = {}".format(len(detail_history)))
  211. print(detail_history)
  212.  
  213. print("@3@ s_date = {}".format(len(s_date)))
  214. print(s_date)
  215.  
  216. print("@4@ e_date = {}".format(len(e_date)))
  217. print(e_date)
  218.  
  219. print("@5@ name_list = {}".format(len(name_list)))
  220. print(name_list)
  221.  
  222. print("@6@ url_list = {}".format(len(url_list)))
  223. print(linkedin_url)
  224.  
  225. #data_append
  226. try:
  227. for i in range (len(name_list)):
  228. linkedin_takram_0_50.append([name_list[i], career_history[i], detail_history[i], s_date[i], e_date[i], url_list[i]])
  229. except IndexError:
  230. print("Index Error _ Nfo1")
  231.  
  232. #if_not_Nfor1_exists
  233. except:
  234. print("Not_nfor1_exists")
  235. #work
  236. company_history = sel.xpath('//*[starts-with(@class, "pv-entity__secondary-title")]/text()').extract()
  237. position_history = sel.xpath('//*[starts-with(@class, "t-16 t-black t-bold")]/text()').extract()
  238. work_date1 = sel.xpath('//*[starts-with(@class, "pv-entity__date-range t-14 t-black--light t-normal")]/span[2]//text()').extract()
  239.  
  240. #education
  241. #education_detail には "degree"と"field of study"があり、両方とるとdetail数が2倍になり、career_historyと数が合わなくなるため、degreeのみとる。
  242. education_history = sel.xpath('//*[starts-with(@class, "pv-entity__school-name t-16 t-black t-bold")]/text()').extract()
  243. degree_history = sel.xpath('//*[starts-with(@class, "pv-entity__secondary-title pv-entity__degree-name pv-entity__secondary-title t-14 t-black t-normal")]/span[2]/text()').extract()
  244. s_education_date = sel.xpath('//*[starts-with(@class, "pv-entity__dates t-14 t-black--light t-normal")]/span/time[1]/text()').extract()
  245. e_education_date = sel.xpath('//*[starts-with(@class, "pv-entity__dates t-14 t-black--light t-normal")]/span/time[2]/text()').extract()
  246.  
  247. #name
  248. name = sel.xpath('//*[starts-with(@class, "pv-top-card-section__name")]/text()').extract_first()
  249.  
  250. # validating if the fields exist on the profile
  251. name = validate_field(name)
  252. company_history = validate_field(company_history)
  253. position_history = validate_field(position_history)
  254. work_date1 = validate_field(work_date1)
  255. education_history = validate_field(education_history)
  256. degree_history = validate_field(degree_history)
  257. s_education_date = validate_field(s_education_date)
  258. e_education_date = validate_field(e_education_date)
  259.  
  260. #data_arrangement
  261.  
  262. #company_history
  263.  
  264. if len(company_history)==0:
  265. company_history.append('NaN – NaN')
  266.  
  267. while "\n " in company_history:
  268. company_history.remove("\n ")
  269.  
  270. while "\n " in company_history:
  271. company_history.remove("\n ")
  272.  
  273. while "\n " in company_history:
  274. company_history.remove("\n ")
  275.  
  276. #position_history
  277.  
  278. if len(position_history)==0:
  279. position_history.append("NaN – NaN")
  280.  
  281. while "\n " in position_history:
  282. position_history.remove("\n ")
  283.  
  284. while "\n " in position_history:
  285. position_history.remove("\n ")
  286.  
  287. while "\n " in position_history:
  288. position_history.remove("\n ")
  289.  
  290. while "Messaging Settings" in position_history:
  291. position_history.remove("Messaging Settings")
  292.  
  293. #work_date (s_new_work_data, e_new_work_date)
  294.  
  295. while len(company_history) > len(work_date1):
  296. work_date1.append("NaN – NaN")
  297.  
  298. work_date2 = []
  299. for w in work_date1:
  300. if len(w) == 8:
  301. w = w + " – NaN"
  302. if len(w) == 4:
  303. w = w + " – NaN"
  304. work_date2.append(w)
  305.  
  306. new_work_date = []
  307. for w in work_date2:
  308. new_work_date.append(w.split(" – "))
  309.  
  310. #new_work_dateは二重リスト。replaceはリストには使えないため、forを二回行う。
  311. new_work_date1 = []
  312. for d in new_work_date:
  313. for e in d:
  314. f = e.replace("Jan", "01 /").replace("Feb", "02 /").replace("Mar", "03 /").replace("Apr", "04 /").replace("May", "05 /").replace("Jun", "06 /").replace("Jul", "07 /").replace("Aug", "08 /").replace("Sep", "09 /").replace("Oct", "10 /").replace("Nov", "11 /").replace("Dec", "12 /")
  315. new_work_date1.append(f)
  316.  
  317. s_new_work_date = new_work_date1[0::2]
  318. e_new_work_date = new_work_date1[1::2]
  319.  
  320. #education_history, degree_history
  321. while len(degree_history) < len(education_history):
  322. degree_history.append("NaN – NaN")
  323.  
  324. #extend(company_history, education_history)
  325.  
  326. career_history = company_history + education_history
  327.  
  328. #extend(position_history, education_degree)
  329. detail_history = position_history + degree_history
  330.  
  331. #extend(s_date, e_date)
  332.  
  333. s_date = s_new_work_date + s_education_date
  334. e_date = e_new_work_date + e_education_date
  335.  
  336. #name
  337.  
  338. if name:
  339. name = name.strip()
  340.  
  341. name_list = []
  342. name_list.append(name)
  343.  
  344. while len(name_list) < (len(s_new_work_date) + len(s_education_date)):
  345. name_list.append(name)
  346.  
  347. url_list =[]
  348. while len(url_list) < len(name_list):
  349. url_list.append(linkedin_url)
  350.  
  351. print("company_history = {}".format(len(company_history)))
  352. print(company_history)
  353.  
  354. print("position_history = {}".format(len(position_history)))
  355. print(position_history)
  356.  
  357. print("work_date")
  358. print(work_date1)
  359.  
  360. print("work_date2")
  361. print(work_date2)
  362.  
  363. print("new_work_date")
  364. print(new_work_date)
  365.  
  366. print("new_work_date1")
  367. print(new_work_date1)
  368.  
  369. print("education_history = {}".format(len(education_history)))
  370. print(education_history)
  371.  
  372. print("s_new_work_date")
  373. print(s_new_work_date)
  374.  
  375. print("e_new_work_date")
  376. print(e_new_work_date)
  377.  
  378. print("degree_history")
  379. print(degree_history)
  380.  
  381. print("@1@ career_history = {}".format(len(career_history)))
  382. print(career_history)
  383.  
  384. print("@2@ detail_history = {}".format(len(detail_history)))
  385. print(detail_history)
  386.  
  387. print("@3@ s_date = {}".format(len(s_date)))
  388. print(s_date)
  389.  
  390. print("@4@ e_date = {}".format(len(e_date)))
  391. print(e_date)
  392.  
  393. print("@5@ name_list = {}".format(len(name_list)))
  394. print(name_list)
  395.  
  396. print("@6@ url_list = {}".format(len(url_list)))
  397. print(linkedin_url)
  398.  
  399. #data_append
  400. try:
  401. for i in range (len(name_list)):
  402. linkedin_takram_0_50.append([name_list[i], career_history[i], detail_history[i], s_date[i], e_date[i], url_list[i]])
  403. except IndexError:
  404. print("Index Error_not_Nfor1")
  405.  
  406. #with open("linkedin_data01.csv", "w", newline="", encoding="utf-8") as f:
  407. #writer = csv.writer(f, lineterminator ="\n")
  408. #writer.writerow(['Name', 'company/school', 'position/degree', 'start_date', 'end_date', 'url'])
  409. #writer.writerows(linkedin_data01)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement