pars_ledron

from xls_prep import create_n_fill, read
from config import *

import logging
from grab.spider import Spider, Task
from datetime import datetime

import shutil
import requests

import pprint
import json
import csv

attributes = []
categories = []
items = {}
product_id = 0

class Crawler(Spider):
    initial_urls = [SITE_URL]

    def task_initial(self, grab, task):
        if VERBOSE:
            print("TASK : initial  " + task.url)
        grab.setup(proxy=PROXY_LIST[1])

        for cat in grab.doc.select(INITIAL_SELECTOR):
            link = cat.attr('href')
            cat_name = cat.text()
            categories.append(cat_name)
            yield Task('category', url=link, cat=cat_name)


    def task_category(self, grab, task):
        if VERBOSE:
            print("TASK : category  " + task.cat)

        for k in grab.doc.select(CATEGORY_SELECTOR):

            item_link=k.attr("href")
            # product_id += 1
            if task.cat not in categories:
                categories.append(task.cat)
            yield Task('item',url=item_link, cat=task.cat, product_id=product_id)

    def task_item(self, grab, task):

        if VERBOSE:
            print("TASK : item  " + task.url)
        item_pics = []

        item_title = grab.doc.select(ITEM_TITLE_SELECTOR).text()
        item_price = grab.doc.select(ITEM_PRICE_SELECTOR).text().replace(',','')
        item_attr = grab.doc.select(ITEM_ATTR_SELECTOR).html()
        item_attr = clear_desc(item_attr)
        item_desc = grab.doc.select(ITEM_DESC_SELECTOR).text()
        if item_desc.startswith('Описание товара'):
            item_desc = item_desc[15:]
        item_cat = categories.index(task.cat) +1  # adapt to opencart category numeration
        product_id = len(items)
        node_pics = grab.doc.select('//div[contains(@class,"images")]')
        for elem in node_pics:
            item_pics.append(elem.select('./a').attr('href'))

        node_pics = grab.doc.select('//div[contains(@class,"thumbnails")]')
        for elem in node_pics:
            if elem not in item_pics:
                item_pics.append(elem.select('./a').attr('href'))

        item_attr = attr_divide(item_attr)

        if task.url not in items.keys():
            items[task.url] = {
                'id': product_id,\
                'title':item_title,\
                'price':item_price,\
                'cat': item_cat,\
                'desc': item_desc,\
                'pics': item_pics,\
                'attr': item_attr}

        # # print(items)


        # print(item_price)
        # print(item_cat)
        # print(item_desc + '\n')
        # print(item_pics)
        # for item in item_pics:
        #     save_img(item)

def attr_divide(attr):
    attr = attr.replace('\xa0',' ').replace('\r','').replace('<div class="prop_table">','').replace('<div>','').replace('<span class="prop_name">','').replace('<span style="line-height','').strip()
    item_attr = attr.split('\n')
    for p in item_attr:

        if p == 'Габариты ':
            item_attr[item_attr.index(p)] = 'Габариты: '
        attr_key= p.split(':')[0]
        if attr_key not in attributes:
            attributes.append(attr_key)
        print(attr_key)
    # return(item_attr)

def save_img(url):
    filename = url.split('/')[-1]
    response = requests.get(url, stream=True)

    with open(filename, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)
    del response
    print(filename + '  saved')
    return

def clear_desc(item_desc):
    return item_desc.replace('<p>','').\
                    replace('</p>',' ').\
                    replace('<p class="record_title">','').\
                    replace('<div itemprop="description">','').\
                    replace('</div>','').replace('\t','')


def prepare_xlsx():
    create_n_fill(PRODUCTS_NAME, PRODUCTS_HEADERS, PRODUCTS_SHEETS)
    create_n_fill(CATEGORIES_NAME , CATEGORIES_HEADERS , CATEGORIES_SHEETS)

def trim(url): # absolute url to relative
    new_url = url[len(SITE_URL)+1:]
    return new_url


def sql_export_attributes():
    print('TRUNCATE TABLE `oc_attribute_description`;')
    for p in attributes:
        print('INSERT INTO `oc_attribute_description` (`attribute_id`, `language_id`, `name`) VALUES (\''+ str(100+attributes.index(p)) + '\', \'' + str(1) +'\', \''+ p + '\');')
        print('INSERT INTO `oc_attribute_description` (`attribute_id`, `language_id`, `name`) VALUES (\''+ str(100+attributes.index(p)) + '\', \'' + str(2) +'\', \''+ p + '\');')

def main():
    bot = Crawler()
    # prepare_xlsx()


    logging.basicConfig(filename='ledron.log',level=logging.DEBUG)

    try:
        bot.run()
    except KeyboardInterrupt:
        print('Keyboard Interrupt')

    read('import-export/products.xlsx',items.values())
    sql_export_attributes()
if __name__ == '__main__':
    main()