Advertisement
Guest User

Untitled

a guest
Jul 7th, 2016
100
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.64 KB | None | 0 0
  1. import cookielib
  2. import os
  3. import urllib
  4. import urllib2
  5. import re
  6. import string
  7. from bs4 import BeautifulSoup
  8.  
# SECURITY NOTE(review): real-looking LinkedIn credentials are hard-coded in
# source (and were pasted publicly).  These should be rotated immediately and
# loaded from environment variables or a config file kept out of version
# control instead of living in the script.
username = "nasir.quadree@gmail.com"
password = "obama44"

# Path of the Mozilla-format cookie jar persisted between runs so a saved
# session can be reused instead of logging in again.
cookie_filename = "parser.cookies.txt"
  13.  
  14. class LinkedInParser(object):
  15.  
  16. def __init__(self, login, password):
  17. """ Start up... """
  18. self.login = login
  19. self.password = password
  20.  
  21. # Simulate browser with cookies enabled
  22. self.cj = cookielib.MozillaCookieJar(cookie_filename)
  23. if os.access(cookie_filename, os.F_OK):
  24. self.cj.load()
  25. self.opener = urllib2.build_opener(
  26. urllib2.HTTPRedirectHandler(),
  27. urllib2.HTTPHandler(debuglevel=0),
  28. urllib2.HTTPSHandler(debuglevel=0),
  29. urllib2.HTTPCookieProcessor(self.cj)
  30. )
  31. self.opener.addheaders = [
  32. ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
  33. 'Windows NT 5.2; .NET CLR 1.1.4322)'))
  34. ]
  35.  
  36. # Login
  37. self.loginPage()
  38.  
  39. title = self.loadTitle()
  40. print title
  41. self.cj.save()
  42.  
  43.  
  44. def loadPage(self, url, data=None):
  45. """
  46. Utility function to load HTML from URLs for us with hack to continue despite 404
  47. """
  48. # We'll print the url in case of infinite loop
  49. # print "Loading URL: %s" % url
  50. try:
  51. if data is not None:
  52. response = self.opener.open(url, data)
  53. else:
  54. response = self.opener.open(url)
  55. return ''.join(response.readlines())
  56. except:
  57. # If URL doesn't load for ANY reason, try again...
  58. # Quick and dirty solution for 404 returns because of network problems
  59. # However, this could infinite loop if there's an actual problem
  60. return self.loadPage(url, data)
  61.  
  62. def loginPage(self):
  63. """
  64. Handle login. This should populate our cookie jar.
  65. """
  66. html = self.loadPage("https://www.linkedin.com/")
  67. soup = BeautifulSoup(html)
  68. csrf = soup.find(id="loginCsrfParam-login")['value']
  69.  
  70. login_data = urllib.urlencode({
  71. 'session_key': self.login,
  72. 'session_password': self.password,
  73. 'loginCsrfParam': csrf,
  74. })
  75.  
  76. html = self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
  77. return
  78.  
  79. def loadTitle(self):
  80. html = self.loadPage("http://www.linkedin.com/nhome")
  81. soup = BeautifulSoup(html)
  82. return soup.find("title")
  83.  
  84. def getUserInfo(self, url):
  85. html = self.loadPage(url)
  86. soup = BeautifulSoup(html)
  87. user_info = {}
  88.  
  89. #get user locality
  90. if len(soup.find_all("span", class_='locality')) == 0:
  91. locality = "N/A"
  92. else:
  93. locality = soup.find_all("span", class_='locality')[0].string
  94. user_info["locality"] = locality
  95.  
  96. #get user name
  97. if len(soup.find_all("h1", id='name')) == 0:
  98. name = "N/A"
  99. else:
  100. name = soup.find_all("h1", id='name')[0].string
  101. # print soup.find_all("div", class_='profile-overview')
  102. user_info["name"] = name
  103.  
  104. #get title
  105. if len(soup.find_all("p", class_="title")) == 0:
  106. title = "N/A"
  107. else:
  108. title = soup.find_all("p", class_="title")[0].string
  109. user_info["title"] = title
  110.  
  111. print soup.find_all("div", id="email")
  112.  
  113. return user_info
  114.  
  115. parser = LinkedInParser(username, password)
  116. print parser.getUserInfo("https://www.linkedin.com/in/quinton-lampkin-93527613")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement