Untitled

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
Input CSV files fetcher.
Handle input and output files.
Also handle profiles and webpage.

Author:  Arount (Arnout Pierre - pierre@arount.info)
Package: Salescraper
Version: 0.0
'''

import os.path
import csv
import time
from random import randint

from salescraper.logger import logger
from salescraper.browser import browser
from selenium.webdriver.common.keys import Keys


class Fetcher(object):
    '''
    Fetch input csv.
    Is an iterator: each iteration step yields one Profile built from one
    CSV row. The first row of the file is treated as a header and skipped.

    Calling iter() on the instance (re)opens the input file and restarts
    from the first data row. The file is closed automatically once the
    rows are exhausted, or explicitly through close().
    '''


    def __init__(self, inputpath, outputdir='_output'):
        '''
        inputpath: path of the CSV file to read.
        outputdir: accepted for backward compatibility, unused by this class.
        '''
        self.inputpath = inputpath
        # Handle of the file currently being iterated; None when closed.
        self._source = None


    def __iter__(self):
        '''
        Open the input file, skip its header row and return self.
        '''
        # Release the handle of a previous, possibly unfinished, iteration.
        self.close()
        # newline='' is what the csv module expects so that newlines
        # embedded in quoted fields are parsed correctly.
        self._source = open(self.inputpath, 'r', encoding='utf-8', newline='')
        self.csv = csv.reader(self._source, delimiter=',', quotechar='"')
        # Skip the header. The default keeps an empty file from raising
        # StopIteration out of __iter__, which `for` would not catch.
        next(self.csv, None)
        return self


    def __next__(self):
        '''
        Cast CSV line into Profile instance here to avoid useless memory usage.
        Raise StopIteration (after closing the file) when no row is left.
        '''
        try:
            line = next(self.csv)
            # csv.reader yields [] for blank lines; they carry no data.
            while not line:
                line = next(self.csv)
        except StopIteration:
            self.close()
            raise
        return Profile(self._line_to_dict(line))


    def close(self):
        '''
        Close the input file if it is open. Safe to call more than once.
        '''
        source = getattr(self, '_source', None)
        if source is not None:
            source.close()
            self._source = None


    def _line_to_dict(self, line):
        '''
        Map a CSV row (list of at least 9 columns) to a profile dict.
        "id" is the part of the first column that precedes its first comma.
        Raise IndexError when the row has fewer than 9 columns.
        '''
        return {
            "raw_id": line[0],
            "id": line[0].split(',')[0],
            "full_name": line[1],
            "uri": line[2],
            "first_name": line[3],
            "last_name": line[4],
            "avatar": line[5],
            "title": line[6],
            "company": line[7],
            "position": line[8]
        }


class Profile(object):
    '''
    A profile can be seen as the Python representation of a linkedin profile page.
    It stores all information from the source CSV and is able to fetch
    the online page to extract more information (via contexts).

    Calling the instance loads the page and returns the instance itself,
    which is also a context manager, so the fetch and the trailing sleep
    can be written as `with profile() as p: ...`.
    '''

    # Profile is in charge of handling sleep times to avoid being blocked.
    # Both are (min, max) bounds in seconds, passed to randint (inclusive).
    sleep_time = (4, 8)          # wait after loading the page, in __call__
    random_sleep_time = (6, 16)  # wait when leaving the context, in __exit__


    def __init__(self, profiledict):
        '''
        profiledict: dict of profile attributes; must contain 'uri'
        ('full_name' and 'id' are needed by __repr__ and __call__).

        Reads data/version.txt, relative to the current working directory.
        '''
        # NOTE(review): nothing in this class sets `fetched` to True;
        # presumably callers do — confirm.
        self.fetched = False
        self.attrs = profiledict
        self.uri = self.attrs['uri']

        # Context manager so the file handle is closed right away.
        with open(os.path.join('data/', 'version.txt')) as version_file:
            self.version = version_file.read().strip()


    def __repr__(self):
        return '<Profile "{full_name}", id:{id}, fetched?:{0}>'.format(self.fetched, **self.attrs)


    def __enter__(self):
        return self

    def __call__(self):
        '''
        Open the profile webpage and handle waiting time.

        Stores the page source in self.raw_html. When the version is '2',
        also tries to expand the summary, stores its text in self._raw_data
        and appends it, HTML-escaped, to self.raw_html in a dedicated div.
        Returns self so the call can be used as a `with` expression.
        '''
        # Local imports: only needed here, keeps the class self-contained.
        from html import escape
        from selenium.webdriver.common.by import By

        logger.log('Querying uri \'{}\' for \'{}\' ({})'.format(
                self.uri,
                self.attrs['full_name'],
                self.attrs['id']
            ), logtype='Profile', level=2
        )
        browser.get(self.uri)

        seconds = randint(*self.sleep_time)
        logger.log('Waiting {} seconds..'.format(seconds),
            logtype='Profile:__call__', level=2
        )
        time.sleep(seconds)
        if self.version == '2':
            # Click on "see more" description link.
            # find_element(By, ...) is used because the find_element_by_*
            # helpers no longer exist in recent Selenium releases.
            try:
                browser.driver.find_element(
                    By.CSS_SELECTOR, '.profile-topcard__summary-expand-link'
                ).click()
                time.sleep(0.5)
                summary = browser.driver.find_element(
                    By.CSS_SELECTOR, '.profile-topcard__summary-modal-content > p'
                ).text
            except Exception:
                # Best effort: a profile without summary is not an error.
                # `Exception` (not a bare except) lets Ctrl-C and SystemExit
                # through.
                summary = ""
            self._raw_data = {"summary": summary}

            browser.scroll_down()

        # Raw HTML is computed after some seconds to be sure all JS is actually
        # loaded and executed
        self.raw_html = browser.driver.page_source
        if self.version == '2':
            # The summary is plain text: escape it so characters such as
            # '<' or '&' cannot corrupt the markup appended to the page.
            self.raw_html += '<div class="__salescraper-summary">{}</div>'.format(
                escape(self._raw_data['summary'], quote=False)
            )
        return self


    def __exit__(self, exc_type, exc_val, exc_tb):
        '''
        Perform a last and random sleep before leaving context.
        This ensures the script waits as expected between two profiles.
        Returns None, so an exception raised inside the context propagates.
        '''
        seconds = randint(*self.random_sleep_time)
        logger.log('Waiting {} seconds..'.format(seconds),
            logtype='Profile:__exit__', level=2
        )
        time.sleep(seconds)