Advertisement
renierb

Untitled

Jul 17th, 2018
126
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.10 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. '''
  5. Input CSV files fetcher.
  6. Handle input and output files.
  7. Also handle profiles and webpage.
  8.  
  9. Author: Arount (Arnout Pierre - pierre@arount.info)
  10. Package: Salescraper
  11. Version: 0.0
  12. '''
  13.  
  14. import os.path
  15. import csv
  16. import time
  17. from random import randint
  18.  
  19. from salescraper.logger import logger
  20. from salescraper.browser import browser
  21. from selenium.webdriver.common.keys import Keys
  22.  
  23.  
  24. class Fetcher(object):
  25. '''
  26. Fetch input csv.
  27. Is an iterator.
  28. '''
  29.  
  30.  
  31. def __init__(self, inputpath, outputdir='_output'):
  32. self.inputpath = inputpath
  33.  
  34.  
  35. def __iter__(self):
  36. '''
  37. Iterate on self.csv items
  38. '''
  39. source = open(self.inputpath, 'r', encoding='utf-8')
  40. self.csv = csv.reader(source, delimiter=',', quotechar='"')
  41. next(self.csv)
  42. return self
  43.  
  44.  
  45. def __next__(self):
  46. '''
  47. Cast CSV line into Profile instance here to avoid useless memory usage
  48. '''
  49. return Profile(self._line_to_dict(next(self.csv)))
  50.  
  51.  
  52. def _line_to_dict(self, line):
  53. '''
  54. It's a map
  55. '''
  56. return {
  57. "raw_id": line[0],
  58. "id": line[0].split(',')[0],
  59. "full_name": line[1],
  60. "uri": line[2],
  61. "first_name": line[3],
  62. "last_name": line[4],
  63. "avatar": line[5],
  64. "title": line[6],
  65. "company": line[7],
  66. "position": line[8]
  67. }
  68.  
  69.  
  70. class Profile(object):
  71. '''
  72. A profile can be seen as the Python representation of a linkedin profile page.
  73. It store all informations from source CSV and is able to fetch
  74. the online page to extract more information (via contexts).
  75. '''
  76.  
  77. # Profile is in charge of handling sleep times to avoid being blocked.
  78. sleep_time = (4, 8)
  79. random_sleep_time = (6, 16)
  80.  
  81.  
  82. def __init__(self, profiledict):
  83. self.fetched = False
  84. self.attrs = profiledict
  85. self.uri = self.attrs['uri']
  86.  
  87. self.version = open(os.path.join('data/', 'version.txt')).read().strip()
  88.  
  89.  
  90. def __repr__(self):
  91. return '<Profile "{full_name}", id:{id}, fetched?:{0}>'.format(self.fetched, **self.attrs)
  92.  
  93.  
  94. def __enter__(self):
  95. return self
  96.  
  97. def __call__(self):
  98. '''
  99. Enter in context,
  100. Open webpage and handle waiting time
  101. '''
  102. logger.log('Querying uri \'{}\' for \'{}\' ({})'.format(
  103. self.uri,
  104. self.attrs['full_name'],
  105. self.attrs['id']
  106. ), logtype='Profile', level=2
  107. )
  108. browser.get(self.uri)
  109.  
  110. seconds = randint(*self.sleep_time)
  111. logger.log('Waiting {} seconds..'.format(seconds),
  112. logtype='Profile:__call__', level=2
  113. )
  114. time.sleep(seconds)
  115. if self.version == '2':
  116. # Click on "see more" description link
  117. try:
  118. browser.driver.find_element_by_css_selector('.profile-topcard__summary-expand-link').click()
  119. time.sleep(0.5)
  120. data = browser.driver.find_element_by_css_selector('.profile-topcard__summary-modal-content > p').text
  121. self._raw_data = {"summary": data}
  122. except:
  123. self._raw_data = {"summary": ""}
  124.  
  125. browser.scroll_down()
  126.  
  127. # Raw HTML is computed after some seconds to be sure all JS is actually
  128. # loaded and executed
  129. self.raw_html = browser.driver.page_source
  130. if self.version == '2':
  131. self.raw_html += '<div class="__salescraper-summary">{}</div>'.format(self._raw_data['summary'])
  132. return self
  133.  
  134.  
  135. def __exit__(self, exc_type, exc_val, exc_tb):
  136. '''
  137. Perform a last and random sleep before leaving context.
  138. This ensure and securize the script to wait as expected.
  139. '''
  140. seconds = randint(*self.random_sleep_time)
  141. logger.log('Waiting {} seconds..'.format(seconds),
  142. logtype='Profile:__exit__', level=2
  143. )
  144. time.sleep(seconds)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement