Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # *-* coding: utf-8 *-*
- from scrapy.spider import Spider
- from scrapy.selector import Selector
- from scrapy.crawler import Crawler
- from scrapy.settings import Settings
- from scrapy.http import Request
- import peewee
- import os.path
- import re
- from urllib import urlretrieve
- from datetime import datetime
- from twisted.internet import reactor
- from PIL import Image, ImageEnhance
- from shutil import rmtree
- from peewee import *
# Database (test params)
# Module-level MySQL connection used by the SpiderBase model below.
# NOTE(review): credentials are test-only (root / empty password) — replace
# before deploying.
db = MySQLDatabase('scrapy', user='root', passwd='')
# Base directory (with trailing slash) where lot images are stored.
path = 'mthimages/'
# Create column to database
class SpiderBase(peewee.Model):
    """Peewee ORM model: one row per scraped auction lot.

    Populated by AutoSpider.parse_page(); all values are stored as the raw
    scraped text, without normalisation.
    NOTE(review): parse_page also extracts 'colour', 'rtr' and 'inspect',
    but this model has no columns for them — confirm that is intentional.
    """
    # Auction-day header, e.g. 'Jan 01 2015' (later parsed with '%b %d %Y'
    # in AutoSpider.folder()).
    date_of_auction = peewee.CharField(max_length=14)
    auction_type = peewee.TextField()
    location = peewee.TextField()
    # Time like '10.30', extracted with the regex (\d{1,2}.\d{2}).
    auction_time = peewee.CharField(max_length=6)
    lot_no = peewee.CharField(max_length=3)
    year = peewee.CharField(max_length=4)
    make = peewee.TextField()
    model = peewee.TextField()
    body = peewee.TextField()
    km = peewee.TextField()
    engine = peewee.TextField()
    reg = peewee.TextField()
    registration_province = peewee.TextField()
    registration_expiry = peewee.CharField(max_length=8)
    owners = peewee.TextField()
    drive = peewee.TextField()
    fees = peewee.TextField()
    # Single-letter condition grade, upper-cased in parse_page.
    grade = peewee.CharField(max_length=1)
    type_auc = peewee.CharField(max_length=3)
    # Source detail-page URL; used for duplicate detection via list_links.
    link = peewee.TextField()
    # Image folder name built by AutoSpider.folder().
    folder = peewee.CharField(max_length=20)

    class Meta:
        # Bind the model to the module-level MySQL connection above.
        database = db
# Create the table on first run; peewee raises InternalError when the table
# already exists, which is deliberately ignored here.
try:
    SpiderBase.create_table()
except peewee.InternalError:
    pass
# Snapshot of already-scraped detail-page URLs; parse_catalog consults it to
# skip lots that are already in the database.
# NOTE(review): loaded once at import time and never refreshed during the
# crawl — rows inserted by this run are not appended to it.
list_links = [l.link for l in SpiderBase.select()]
# Spider Scrapy
class AutoSpider(Spider):
    """Scrapy spider: walks the auction site, scrapes per-lot details and
    stores each new lot as a SpiderBase row."""
    name = 'auto'
    # Placeholder domain/URL — the real target appears redacted in this paste.
    allowed_domains = ["site.com"]
    start_urls = [
        "site.com"
    ]
- def parse(self, response):
- sel = Selector(response)
- elements = ' '.join(sel.xpath('//*[@id="col-01"]/div[1]/ul[1]/li[4]/div[2]/strong/text()').extract())
- links = sel.xpath('//*[@id="col-01"]/div/div/ul/li/a/@href').extract()
- for link in links:
- link = '/'.join(link.split('/')[:-1]) + '/{0}/{0}'.format(str(elements)) + '/'
- yield Request(url=link, callback=self.parse_catalog)
- def parse_catalog(self, response):
- sel = Selector(response)
- links_auto = sel.xpath('//div[@class="car-detail-list"]/a/@href').extract()
- if bool(links_auto):
- for link_a in links_auto:
- if link_a not in list_links:
- print 'Link {0} sent for processing'.format(link_a)
- yield Request(url=link_a, callback=self.parse_page)
- else:
- print 'Link {0} there are in table'.format(link_a)
- def parse_page(self, response):
- self.sel = Selector(response)
- dic = {}
- dic['date_of_auction'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[1]/descendant::text()').extract())
- dic['auction_type'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[3]/h4/descendant::text()').extract())
- dic['location'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[3]/p[1]/text()').extract())
- dic['auction_time'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[3]/p[2]/text()').re('(\d{1,2}.\d{2})'))
- dic['lot_no'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/h3/span/text()').re('\d+'))
- dic['year'] = ' '.join(self.year())
- dic['make'] = ' '.join(self.make())
- dic['model'] = ' '.join(self.model())
- dic['body'] = self.table('Body :')
- dic['km'] = self.table('KM. :')
- dic['engine'] = self.table('Engine :')
- dic['colour'] = self.table('Colour :')
- dic['reg'] = self.table('Reg. :')
- dic['registration_province'] = self.table('Reg. Province :')
- dic['registration_expiry'] = self.table('Reg. Expiry :')
- dic['owners'] = self.table('Owners :')
- dic['drive'] = self.table('Drive :')
- dic['fees'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/p[2]/text()').re('\w+'))
- dic['grade'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/div[2]/@class').re('-([a-z])')).upper()
- dic['rtr'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/div[2]/div/a[1]/@href').extract())
- dic['inspect'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/div[2]/div/a[2]/@href').extract())
- dic['type_auc'] = ''.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[4]/div[1]/figure/img[contains(@src,'
- ' "-active")]/@title').re('[A-Z]+'))
- self.dic = dic
- """
- self.images_loading()
- print 'Images from {0} uploaded in folder {1}'.format(response.url, self.folder())
- """
- col = SpiderBase(
- date_of_auction=dic['date_of_auction'],
- auction_type=dic['auction_type'],
- location=dic['location'],
- auction_time=dic['auction_time'],
- lot_no=dic['lot_no'],
- year=dic['year'],
- make=dic['make'],
- model=dic['model'],
- body=dic['body'],
- km=dic['km'],
- engine=dic['engine'],
- reg=dic['reg'],
- registration_province=dic['registration_province'],
- registration_expiry=dic['registration_expiry'],
- owners=dic['owners'],
- drive=dic['drive'],
- fees=dic['fees'],
- grade=dic['grade'],
- type_auc=dic['type_auc'],
- link=response.url,
- folder=self.folder()
- )
- col.save()
- print 'Link {0} added to database'.format(response.url)
- def lot(self):
- return self.sel.xpath('//*[@class="helv"]/span/text()')[0].re('\d+')
    def year(self):
        # Cache the lot-title text nodes; make() and model() reuse this
        # attribute, so year() must be called first (parse_page does so).
        self.helv_text = self.sel.xpath('//*[@class="helv"]/text()')
        # The 4-digit model year embedded in the title.
        return self.helv_text.re('\d{4}')
    def make(self):
        # Manufacturer: the word immediately before ' :' in the cached title.
        # Requires year() to have run first to populate self.helv_text.
        return self.helv_text.re('([a-zA-Z0-9]+)\s\:')
    def model(self):
        # Model name: everything after ': ' in the cached title.
        # Requires year() to have run first to populate self.helv_text.
        return self.helv_text.re(':\s+([a-zA-Z0-9 .-]+)')
- def table(self, key):
- column1 = self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/dl/dt/text()').extract()
- column2 = self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/dl/dd/text()').extract()
- column2 = list(map(lambda x: x.strip(), column2))
- table_dict = dict(zip(column1, column2))
- return table_dict.get(key, '-')
- def folder(self):
- dt = datetime.strptime(self.dic['date_of_auction'], '%b %d %Y')
- dt2 = datetime.strftime(dt, '%d%m%Y')
- return '{0}{1}{2}'.format(self.dic['type_auc'], str(dt2), str(self.dic['lot_no']))
    def images_loading(self):
        """Download the lot's gallery images, watermark them, and fetch the
        RTR / inspection documents.

        Currently disabled — the call site in parse_page is commented out.
        Requires self.sel and self.dic to be set.
        """
        # Ensure the base image directory ('mthimages') exists.
        if not os.path.exists(path[:-1]):
            os.makedirs(path[:-1])
        watermark = Image.open(path + 'watermark.png')
        name = self.folder()
        images = self.sel.xpath('//ul[@class="car-gallery"]/li/a/@href').extract()
        # Drop the lot folder when it holds fewer images than expected, so the
        # block below re-downloads everything.
        self.refresh_folder(len(images))
        if not os.path.exists(path + name) and bool(images):
            os.makedirs(path + name)
            # Re-extract and force a fixed 800x600 size in the image URLs.
            images = self.sel.xpath('//ul[@class="car-gallery"]/li/a/@href').extract()
            images = list(map(lambda x: re.sub('height=([0-9]+)', 'height=600', x), images))
            images = list(map(lambda x: re.sub('width=([0-9]+)', 'width=800', x), images))
            for num, image in enumerate(images, 1):
                # Download to a temporary '<n>w.jpg', composite the watermark
                # into '<n>.jpg', then delete the temporary file.
                urlretrieve(image, path + name + '/' + str(num) + 'w.jpg')
                im2 = Image.open(path + name + '/' + str(num) + 'w.jpg')
                self.add_watermark(im2, watermark).save(path + name + '/' + str(num) + '.jpg')
                os.remove(path + name + '/' + str(num) + 'w.jpg')
        # RTR and inspection sheets are saved un-watermarked, at most once.
        if bool(self.dic['rtr']):
            if not os.path.exists(path + name + '/' + name + 'R.jpg'):
                urlretrieve(self.dic['rtr'], path + name + '/' + name + 'R.jpg')
        if bool(self.dic['inspect']):
            if not os.path.exists(path + name + '/' + name + 'I.jpg'):
                urlretrieve(self.dic['inspect'], path + name + '/' + name + 'I.jpg')
- def refresh_folder(self, count):
- name = self.folder()
- try:
- image_folder = [files for files in os.listdir(path + name) if files.endswith('.jpg')]
- if bool(self.dic['inspect']):
- count += 1
- if bool(self.dic['rtr']):
- count += 1
- if len(image_folder) < count:
- rmtree(path + name)
- except OSError:
- pass
- def add_watermark(self, image, watermark, opacity=1, wm_interval=0):
- assert opacity >= 0 and opacity <= 1
- if opacity < 1:
- if watermark.mode != 'RGBA':
- watermark = watermark.convert('RGBA')
- else:
- watermark = watermark.copy()
- alpha = watermark.split()[3]
- alpha = ImageEnhance.Brightness(alpha).enhance(opacity)
- watermark.putalpha(alpha)
- layer = Image.new('RGBA', image.size, (0,0,0,0))
- for y in range(0, image.size[1], watermark.size[1]+wm_interval):
- for x in range(0, image.size[0], watermark.size[0]+wm_interval):
- layer.paste(watermark, (x, y))
- return Image.composite(layer, image, layer)
if __name__ == '__main__':
    # Crawl settings. BUG FIX: this dict was previously built but never handed
    # to Settings(), so the crawler silently ran with pure defaults; it is now
    # passed in so the user agent, delays and concurrency limits take effect.
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5
    }
    spider = AutoSpider()
    crawler = Crawler(Settings(options))
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    # Old-style Scrapy embedding: the Twisted reactor drives the crawl.
    reactor.run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement