Guest User

Scrapy Spider

a guest
Feb 15th, 2014
428
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.93 KB | None | 0 0
  1. # *-* coding: utf-8 *-*
  2.  
  3. from scrapy.spider import Spider
  4. from scrapy.selector import Selector
  5. from scrapy.crawler import Crawler
  6. from scrapy.settings import Settings
  7. from scrapy.http import Request
  8. from scrapy import log, signals
  9. from scrapy.utils.project import get_project_settings
  10.  
  11. import peewee
  12. import os.path
  13. import re
  14.  
  15. from urllib import urlretrieve
  16. from datetime import datetime
  17. from twisted.internet import reactor
  18. from PIL import Image, ImageEnhance
  19. from shutil import rmtree
  20. from peewee import *
  21.  
# Database connection (test params: local MySQL db 'scrapy', root, empty password).
# NOTE(review): credentials are hard-coded; move to config/env before real use.
db = MySQLDatabase('scrapy', user='root', passwd='')
# Root folder for downloaded car images; code below assumes the trailing slash.
path = 'mthimages/'
  25.  
  26.  
# Peewee model: one table row per scraped car page. All values are stored as
# the raw display strings extracted by AutoSpider.parse_page, not typed values.
class SpiderBase(peewee.Model):
    date_of_auction = peewee.CharField(max_length=14)  # parsed later with '%b %d %Y' (see AutoSpider.folder)
    auction_type = peewee.TextField()
    location = peewee.TextField()
    auction_time = peewee.CharField(max_length=6)
    lot_no = peewee.CharField(max_length=3)
    year = peewee.CharField(max_length=4)
    make = peewee.TextField()
    model = peewee.TextField()
    body = peewee.TextField()
    km = peewee.TextField()
    engine = peewee.TextField()
    reg = peewee.TextField()
    registration_province = peewee.TextField()
    registration_expiry = peewee.CharField(max_length=8)
    owners = peewee.TextField()
    drive = peewee.TextField()
    fees = peewee.TextField()
    grade = peewee.CharField(max_length=1)  # single uppercased letter scraped from a CSS class
    link = peewee.TextField()               # source URL; used for dedup via list_links
    folder = peewee.CharField(max_length=30)  # image folder name built by AutoSpider.folder()

    class Meta:
        database = db
  52.  
# Create the table on first run; if it already exists the raised error is ignored.
# NOTE(review): only peewee.InternalError is swallowed -- depending on the MySQL
# driver/peewee version an existing table may raise OperationalError instead; confirm.
try:
    SpiderBase.create_table()
except peewee.InternalError:
    pass

# URLs already persisted, used by parse_catalog to skip duplicates.
# NOTE(review): loaded once at import time and never refreshed during the crawl,
# so rows written by other processes mid-crawl are not seen.
list_links = [l.link for l in SpiderBase.select()]
  59.  
  60.  
# Scrapy spider: walks the Manheim Thailand auction calendar, follows each
# auction catalogue to the individual car pages, and saves one SpiderBase
# row per previously-unseen car link.
class AutoSpider(Spider):
    name = 'auto'
    allowed_domains = ["manheimthailand.com"]
    start_urls = [
        "http://www.manheimthailand.com/en/site/calendar"
    ]

    def parse(self, response):
        """Parse the calendar page and yield one Request per auction catalogue.

        `elements` is text scraped from the calendar sidebar (presumably an
        auction date/id fragment -- confirm against live markup); it is
        spliced twice into each catalogue URL before following it.
        """
        sel = Selector(response)
        elements = ' '.join(sel.xpath('//*[@id="col-01"]/div[1]/ul[1]/li[4]/div[2]/strong/text()').extract())
        links = sel.xpath('//*[@id="col-01"]/div/div/ul/li/a/@href').extract()
        for link in links:
            # Drop the last path segment and append '<elements>/<elements>/'.
            link = '/'.join(link.split('/')[:-1]) + '/{0}/{0}'.format(str(elements)) + '/'
            yield Request(url=link, callback=self.parse_catalog)

    def parse_catalog(self, response):
        """Parse a catalogue page; queue every car link not already in the DB.

        Dedup is against module-level `list_links`, loaded once at startup.
        """
        sel = Selector(response)
        links_auto = sel.xpath('//div[@class="car-detail-list"]/a/@href').extract()
        if bool(links_auto):
            for link_a in links_auto:
                if link_a not in list_links:
                    print 'Link {0} sent for processing'.format(link_a)
                    yield Request(url=link_a, callback=self.parse_page)
                else:
                    print 'Link {0} there are in table'.format(link_a)

    def parse_page(self, response):
        """Scrape one car detail page and persist it as a SpiderBase row.

        NOTE(review): per-response state is stored on the spider instance
        (self.sel, self.dic, self.num_auc) and read by the helper methods
        below. With CONCURRENT_REQUESTS=30 several responses are in flight
        at once and can clobber each other's state -- confirm and make
        these locals/arguments if so.
        NOTE(review): 'colour' is scraped but SpiderBase has no colour
        column, so it is never saved; 'rtr'/'inspect' are only used by
        images_loading(), 'lane' only by folder().
        """
        self.sel = Selector(response)
        dic = {}
        dic['date_of_auction'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[1]/descendant::text()').extract())
        dic['auction_type'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[3]/h4/descendant::text()').extract())
        dic['location'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[3]/p[1]/text()').extract())
        dic['auction_time'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[3]/p[2]/text()').re('(\d{1,2}.\d{2})'))
        dic['lot_no'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/h3/span/text()').re('\d+'))
        # year() caches self.helv_text, which make()/model() reuse -- call order matters.
        dic['year'] = ' '.join(self.year())
        dic['make'] = ' '.join(self.make())
        dic['model'] = ' '.join(self.model())
        # Spec-sheet values looked up by their exact on-site <dt> labels
        # (note the double spaces in the 'Reg.' labels are intentional).
        dic['body'] = self.table('Body :')
        dic['km'] = self.table('KM. :')
        dic['engine'] = self.table('Engine :')
        dic['colour'] = self.table('Colour :')
        dic['reg'] = self.table('Reg. :')
        dic['registration_province'] = self.table('Reg.  Province :')
        dic['registration_expiry'] = self.table('Reg.  Expiry :')
        dic['owners'] = self.table('Owners :')
        dic['drive'] = self.table('Drive :')
        dic['fees'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/p[2]/text()').re('\w+'))
        # Grade letter comes from a CSS class suffix like '...-a', uppercased.
        dic['grade'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/div[2]/@class').re('-([a-z])')).upper()
        dic['rtr'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/div[2]/div/a[1]/@href').extract())
        dic['inspect'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/div[2]/div/a[2]/@href').extract())
        dic['lane'] = ' '.join(self.sel.xpath('//li[@class="lane"]/strong/text()').extract())

        # Auction number = last URL path segment; used in the folder name.
        self.num_auc = response.url.split('/')[-1]
        self.dic = dic
        # Image download is currently disabled (dead code kept by the author).
        """
       self.images_loading()
       print 'Images from {0} uploaded in folder {1}'.format(response.url, self.folder())
       """

        col = SpiderBase(
            date_of_auction=dic['date_of_auction'],
            auction_type=dic['auction_type'],
            location=dic['location'],
            auction_time=dic['auction_time'],
            lot_no=dic['lot_no'],
            year=dic['year'],
            make=dic['make'],
            model=dic['model'],
            body=dic['body'],
            km=dic['km'],
            engine=dic['engine'],
            reg=dic['reg'],
            registration_province=dic['registration_province'],
            registration_expiry=dic['registration_expiry'],
            owners=dic['owners'],
            drive=dic['drive'],
            fees=dic['fees'],
            grade=dic['grade'],
            link=response.url,
            folder=self.folder()
        )
        col.save()
        print 'Link {0} added to database'.format(response.url)

    def lot(self):
        """Return lot-number digits from the first '.helv' span.

        NOTE(review): appears unused -- parse_page extracts lot_no with its
        own XPath.
        """
        return self.sel.xpath('//*[@class="helv"]/span/text()')[0].re('\d+')

    def year(self):
        """Return 4-digit year matches from the '.helv' heading text.

        Side effect: caches the heading selector in self.helv_text, which
        make() and model() depend on -- year() must run first.
        """
        self.helv_text = self.sel.xpath('//*[@class="helv"]/text()')
        return self.helv_text.re('\d{4}')

    def make(self):
        """Return the make: the word immediately before ' :' in the heading."""
        return self.helv_text.re('([a-zA-Z0-9]+)\s\:')

    def model(self):
        """Return the model: the text after ': ' in the heading."""
        return self.helv_text.re(':\s+([a-zA-Z0-9 .-]+)')

    def table(self, key):
        """Return the spec-table value for `key`, or '-' if absent.

        Zips the page's <dt> labels (kept verbatim, including ' :') with the
        stripped <dd> values into a dict and looks up `key`.
        """
        column1 = self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/dl/dt/text()').extract()
        column2 = self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/dl/dd/text()').extract()
        column2 = list(map(lambda x: x.strip(), column2))
        table_dict = dict(zip(column1, column2))
        return table_dict.get(key, '-')

    def folder(self):
        """Build the image folder name '<ddmmyy>_<lane>_<auction no>'.

        Relies on self.dic / self.num_auc set by parse_page;
        date_of_auction must parse with '%b %d %Y'.
        """
        dt = datetime.strptime(self.dic['date_of_auction'], '%b %d %Y')
        dt2 = datetime.strftime(dt, '%d%m%y')
        return '{0}_{1}_{2}'.format(str(dt2), self.dic['lane'], self.num_auc)

    def images_loading(self):
        """Download, resize (via URL params) and watermark the car gallery.

        Currently disabled: the call site in parse_page is commented out.
        Saves watermarked '<n>.jpg' files into path/<folder>/, deleting the
        raw '<n>w.jpg' downloads, and also fetches the RTR ('...R.jpg') and
        inspection ('...I.jpg') sheets when their URLs are present.
        NOTE(review): requires 'mthimages/watermark.png' to exist --
        Image.open raises otherwise.
        """
        if not os.path.exists(path[:-1]):
            os.makedirs(path[:-1])
        watermark = Image.open(path + 'watermark.png')
        name = self.folder()
        images = self.sel.xpath('//ul[@class="car-gallery"]/li/a/@href').extract()
        # Remove a stale/incomplete folder so the download restarts cleanly.
        self.refresh_folder(len(images))
        if not os.path.exists(path + name) and bool(images):
            os.makedirs(path + name)
            images = self.sel.xpath('//ul[@class="car-gallery"]/li/a/@href').extract()
            # Force 800x600 by rewriting the URL's width/height query params.
            images = list(map(lambda x: re.sub('height=([0-9]+)', 'height=600', x), images))
            images = list(map(lambda x: re.sub('width=([0-9]+)', 'width=800', x), images))
            for num, image in enumerate(images, 1):
                urlretrieve(image, path + name + '/' + str(num) + 'w.jpg')
                im2 = Image.open(path + name + '/' + str(num) + 'w.jpg')
                self.add_watermark(im2, watermark).save(path + name + '/' + str(num) + '.jpg')
                os.remove(path + name + '/' + str(num) + 'w.jpg')
        if bool(self.dic['rtr']):
            if not os.path.exists(path + name + '/' + name + 'R.jpg'):
                urlretrieve(self.dic['rtr'], path + name + '/' + name + 'R.jpg')
        if bool(self.dic['inspect']):
            if not os.path.exists(path + name + '/' + name + 'I.jpg'):
                urlretrieve(self.dic['inspect'], path + name + '/' + name + 'I.jpg')

    def refresh_folder(self, count):
        """Delete the car's image folder if it holds fewer .jpg files than
        expected (gallery count plus one each for RTR/inspect sheets when
        present), so an interrupted download is redone from scratch.
        A missing folder (OSError from listdir) is silently ignored.
        """
        name = self.folder()
        try:
            image_folder = [files for files in os.listdir(path + name) if files.endswith('.jpg')]
            if bool(self.dic['inspect']):
                count += 1
            if bool(self.dic['rtr']):
                count += 1
            if len(image_folder) < count:
                rmtree(path + name)
        except OSError:
            pass

    def add_watermark(self, image, watermark, opacity=1, wm_interval=0):
        """Tile `watermark` across `image` and return the composited result.

        opacity < 1 fades the watermark by scaling its alpha channel;
        wm_interval adds pixel spacing between tiles (0 = edge to edge).
        NOTE(review): `assert` is stripped under python -O; raise instead
        if opacity validation matters.
        """
        assert opacity >= 0 and opacity <= 1
        if opacity < 1:
            if watermark.mode != 'RGBA':
                watermark = watermark.convert('RGBA')
            else:
                watermark = watermark.copy()
            alpha = watermark.split()[3]
            alpha = ImageEnhance.Brightness(alpha).enhance(opacity)
            watermark.putalpha(alpha)
        # Paste the watermark repeatedly onto a transparent layer the size
        # of the target image, then composite using the layer's own alpha.
        layer = Image.new('RGBA', image.size, (0,0,0,0))
        for y in range(0, image.size[1], watermark.size[1]+wm_interval):
            for x in range(0, image.size[0], watermark.size[0]+wm_interval):
                layer.paste(watermark, (x, y))
        return Image.composite(layer,  image,  layer)
  223.  
if __name__ == '__main__':
    # Crawl tuning applied on top of the project settings.
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,   # NOTE(review): >1 concurrent responses share parse_page's self.* state
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
        'CLOSESPIDER_PAGECOUNT': 1000,  # safety stop after 1000 pages
        'CLOSESPIDER_TIMEOUT': 3600,    # ... or one hour, whichever first
    }

    # Run the spider from a script with the old (pre-1.0) Scrapy crawler API.
    # NOTE(review): settings.overrides and Crawler.install() were removed in
    # later Scrapy releases -- this only runs on the version it was written for.
    spider = AutoSpider()
    settings = get_project_settings()
    settings.overrides.update(options)
    crawler = Crawler(settings)
    # Stop the twisted reactor when the spider closes so the script exits.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
Advertisement
Add Comment
Please, Sign In to add comment