Advertisement
Guest User

LinkedIn Scraper

a guest
Sep 27th, 2016
639
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.82 KB | None | 0 0
  1. # Usage: python linkedout.py $linkedin_email $linkedin_password $linkedin_search_company $email_format
  2.  
  3. import sys
  4. import re
  5. from robobrowser import RoboBrowser
  6. import bs4
  7. from requests import Session
  8.  
  9. if len(sys.argv) < 5:
  10.     print("Usage: python linkedout.py $linkedin_email $linkedin_password $linkedin_search_company $email_format\n\nemail format schema:\nfirstmiddlelast@domain.com\nfmiddlelast@domain.com\nfml@domain.com\nfmlast@domain.com\nflast@domain.com\n\n")
  11.     sys.exit(0)
  12.  
  13. session = Session()
  14.  
  15. collected = 0
  16. username = sys.argv[1]
  17. password = sys.argv[2]
  18. company_search = sys.argv[3]
  19. email_format = sys.argv[4]
  20.  
  21. real_format = email_format.split("@")
  22.  
  23. # Browse to Genius
  24. browser = RoboBrowser(parser='html.parser',session=session,user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:49.0) Gecko/20100101 Firefox/49.0')
  25. browser.open('https://www.linkedin.com/uas/login?session_redirect=https://www.linkedin.com/vsearch/p?company=' + company_search + '&openAdvancedForm=true&companyScope=C&fromSignIn=true&trk=uno-reg-join-sign-in')
  26.  
  27. form = browser.get_form(action="https://www.linkedin.com/uas/login-submit")
  28.  
  29. form['session_key'].value = username
  30. form['session_password'].value = password
  31. browser.submit_form(form)
  32.  
  33. while str(browser.parsed).find("Next") != -1:
  34.     str_src = str(browser.parsed)
  35.  
  36.     pattern = re.compile('(?<="fmt_name":")(.*?)(?=",)')
  37.     results = pattern.findall(str_src)
  38.  
  39.     collected += len(results)
  40.     print("\n\nCollected: [" + str(collected) + "] - " + company_search + " Employees...\n\n")
  41.  
  42.     for person in results:
  43.         if person.find("LinkedIn Member") == -1:
  44.             person = person.replace("\u002d", "-")
  45.             names = re.findall('\w+', person)
  46.            
  47.             if real_format[0] == "firstmiddlelast":
  48.                 file_str = ''.join(names)
  49.             elif real_format[0] == "fmiddlelast":
  50.                 file_str = min(names)[0]
  51.                 if len(names) == 3:
  52.                     file_str += names[1]
  53.                 file_str += max(names)
  54.             elif real_format[0] == "fml":
  55.                 file_str = min(names)[0]
  56.                 if len(names) == 3:
  57.                     file_str += names[1][0]
  58.                 file_str += max(names)[0]
  59.             elif real_format[0] == "fmlast":
  60.                 file_str = min(names)[0]
  61.                 if len(names) == 3:
  62.                     file_str += names[1][0]
  63.                 file_str += max(names)
  64.             elif real_format[0] == "flast":
  65.                 file_str = min(names)[0] + max(names)
  66.  
  67.         print(file_str + "@" + real_format[1])
  68.  
  69.         with open(company_search + ".txt", "a") as f:
  70.             f.write(file_str + "@" + real_format[1] + "\n")
  71.  
  72.     # for debugging
  73.     # with open("src.html", "w") as f:
  74.     #     f.write(str_src)
  75.  
  76.     next_page = re.compile('(?<=isCurrentPage":true,"pageURL":")(.*?)(?=","pageNum")')
  77.     next_url = next_page.findall(str_src)
  78.  
  79.     page_num = int(next_url[0].split("page_num=")[1])
  80.     page_num += 1
  81.  
  82.     goto_url = next_url[0].split("page_num=")[0] + "page_num=" + str(page_num)
  83.  
  84.     browser.open('https://www.linkedin.com' + goto_url)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement