Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- '''
- Input CSV files fetcher.
- Handle input and output files.
- Also handle profiles and webpage.
- Author: Arount (Arnout Pierre - pierre@arount.info)
- Package: Salescraper
- Version: 0.0
- '''
- import os.path
- import csv
- import time
- from random import randint
- from salescraper.logger import logger
- from salescraper.browser import browser
- from selenium.webdriver.common.keys import Keys
class Fetcher(object):
    '''
    Fetch input csv.
    Is an iterator.

    Iterating opens ``inputpath``, skips its first (header) row and yields
    one ``Profile`` per remaining row. The underlying file handle is released
    when the rows are exhausted, when iteration is restarted, or when
    ``close()`` is called explicitly (e.g. after breaking out of a loop).
    '''

    def __init__(self, inputpath, outputdir='_output'):
        '''
        :param inputpath: path of the CSV file to read.
        :param outputdir: output directory; it is only stored on the
            instance, this class does not use it itself.
        '''
        self.inputpath = inputpath
        self.outputdir = outputdir
        # Handle of the currently opened input file, None when nothing is open.
        self._source = None

    def __iter__(self):
        '''
        Iterate on self.csv items

        (Re)opens the input file and positions the reader after the header.
        '''
        # Restarting the iteration must not leak the previous handle.
        self.close()
        # newline='' lets the csv module deal with line endings itself, so
        # quoted fields containing line breaks are parsed correctly.
        self._source = open(self.inputpath, 'r', encoding='utf-8', newline='')
        self.csv = csv.reader(self._source, delimiter=',', quotechar='"')
        # Skip the header row. The default keeps an empty file from raising
        # StopIteration out of __iter__ itself.
        next(self.csv, None)
        return self

    def __next__(self):
        '''
        Cast CSV line into Profile instance here to avoid useless memory usage

        :raises StopIteration: when there is no row left; the input file is
            closed at that point.
        '''
        try:
            line = next(self.csv)
        except StopIteration:
            self.close()
            raise
        return Profile(self._line_to_dict(line))

    def close(self):
        '''
        Close the input file if it is open. Safe to call several times.
        '''
        if self._source is not None:
            self._source.close()
            self._source = None
            # A reader bound to a closed file would fail with ValueError;
            # an empty iterator keeps later next() calls on StopIteration.
            self.csv = iter(())

    def _line_to_dict(self, line):
        '''
        It's a map

        Turn one CSV row (a sequence of at least 9 fields) into the dict
        consumed by Profile. "id" is the part of the first field that
        precedes its first comma.

        :raises IndexError: if the row has fewer than 9 fields.
        '''
        return {
            "raw_id": line[0],
            "id": line[0].split(',')[0],
            "full_name": line[1],
            "uri": line[2],
            "first_name": line[3],
            "last_name": line[4],
            "avatar": line[5],
            "title": line[6],
            "company": line[7],
            "position": line[8]
        }
class Profile(object):
    '''
    A profile can be seen as the Python representation of a linkedin profile page.
    It stores all information from the source CSV and is able to fetch
    the online page to extract more information (via contexts).

    Intended use, as far as this class shows: call the instance to load the
    page, and use the result as a context manager so that a random pause is
    taken when the block is left::

        with profile() as loaded:
            ...  # read loaded.raw_html

    ``fetched`` is initialised to False; nothing in this class changes it.
    '''

    # Profile is in charge of handling sleep times to avoid being blocked.
    # Both are (min, max) bounds in seconds handed to randint.
    sleep_time = (4, 8)
    random_sleep_time = (6, 16)

    def __init__(self, profiledict):
        '''
        :param profiledict: mapping describing the profile; must contain
            'uri', and 'full_name' / 'id' are read by __repr__ and __call__.
        :raises KeyError: if 'uri' is missing.
        :raises OSError: if data/version.txt (relative to the current
            working directory) cannot be read.
        '''
        self.fetched = False
        self.attrs = profiledict
        self.uri = self.attrs['uri']
        # Context manager so the handle is closed right away; one Profile is
        # built per CSV row, so leaked handles would pile up.
        with open(os.path.join('data/', 'version.txt')) as version_file:
            self.version = version_file.read().strip()

    def __repr__(self):
        return '<Profile "{full_name}", id:{id}, fetched?:{0}>'.format(self.fetched, **self.attrs)

    def __enter__(self):
        return self

    def __call__(self):
        '''
        Enter in context,
        Open webpage and handle waiting time

        Sets ``raw_html`` (and ``_raw_data`` when version is '2') and
        returns self so the call can be used directly in a ``with``.
        '''
        logger.log('Querying uri \'{}\' for \'{}\' ({})'.format(
            self.uri,
            self.attrs['full_name'],
            self.attrs['id']
        ), logtype='Profile', level=2)
        browser.get(self.uri)

        seconds = randint(*self.sleep_time)
        logger.log('Waiting {} seconds..'.format(seconds),
                   logtype='Profile:__call__', level=2)
        time.sleep(seconds)

        if self.version == '2':
            self._raw_data = {"summary": self._fetch_summary()}

        # NOTE(review): the original indentation was lost; scrolling is
        # assumed to happen for every version, not only for '2' - confirm.
        browser.scroll_down()

        # Raw HTML is computed after some seconds to be sure all JS is actually
        # loaded and executed
        self.raw_html = browser.driver.page_source
        if self.version == '2':
            # NOTE(review): the summary is inserted as-is (not HTML-escaped);
            # confirm the downstream parser copes with '<' or '&' in it.
            self.raw_html += '<div class="__salescraper-summary">{}</div>'.format(self._raw_data['summary'])
        return self

    def _fetch_summary(self):
        '''
        Click on "see more" description link and return the summary text,
        or an empty string when it cannot be read (best effort).
        '''
        # NOTE(review): find_element_by_css_selector no longer exists in
        # recent Selenium 4 releases - confirm the pinned Selenium version.
        try:
            browser.driver.find_element_by_css_selector('.profile-topcard__summary-expand-link').click()
            time.sleep(0.5)
            return browser.driver.find_element_by_css_selector('.profile-topcard__summary-modal-content > p').text
        except Exception as error:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still stop the script; the failure is logged instead of hidden.
            logger.log('Summary not available ({})'.format(type(error).__name__),
                       logtype='Profile:__call__', level=2)
            return ""

    def __exit__(self, exc_type, exc_val, exc_tb):
        '''
        Perform a last and random sleep before leaving context.
        This ensures the script waits as expected.

        Returns None, so an exception raised inside the block propagates
        (after the pause).
        '''
        seconds = randint(*self.random_sleep_time)
        logger.log('Waiting {} seconds..'.format(seconds),
                   logtype='Profile:__exit__', level=2)
        time.sleep(seconds)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement