Advertisement
Guest User

Scrapy Spider

a guest
Feb 15th, 2014
228
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.58 KB | None | 0 0
  1. # *-* coding: utf-8 *-*
  2.  
  3. from scrapy.spider import Spider
  4. from scrapy.selector import Selector
  5. from scrapy.crawler import Crawler
  6. from scrapy.settings import Settings
  7. from scrapy.http import Request
  8.  
  9. import peewee
  10. import os.path
  11. import re
  12.  
  13. from urllib import urlretrieve
  14. from datetime import datetime
  15. from twisted.internet import reactor
  16. from PIL import Image, ImageEnhance
  17. from shutil import rmtree
  18. from peewee import *
  19.  
# Database connection (test parameters) shared by the SpiderBase model and
# the spider below.
# NOTE(review): root with an empty password — replace before deploying.
db = MySQLDatabase('scrapy', user='root', passwd='')
# Base directory (kept with a trailing slash; some call sites strip it with
# path[:-1]) where downloaded car images and the watermark file live.
path = 'mthimages/'
  23.  
  24.  
  25. # Create column to database
# Create column to database
class SpiderBase(peewee.Model):
    """Peewee model: one scraped auction lot per row.

    Field widths mirror the strings produced by AutoSpider.parse_page
    (e.g. auction_time is captured as 'HH.MM', grade is a single upper-case
    letter, folder is the name built by AutoSpider.folder()).

    NOTE(review): parse_page also extracts dic['colour'], but there is no
    matching column here, so that value is silently dropped — confirm
    whether a `colour` field was intended.
    """
    date_of_auction = peewee.CharField(max_length=14)
    auction_type = peewee.TextField()
    location = peewee.TextField()
    auction_time = peewee.CharField(max_length=6)
    lot_no = peewee.CharField(max_length=3)
    year = peewee.CharField(max_length=4)
    make = peewee.TextField()
    model = peewee.TextField()
    body = peewee.TextField()
    km = peewee.TextField()
    engine = peewee.TextField()
    reg = peewee.TextField()
    registration_province = peewee.TextField()
    registration_expiry = peewee.CharField(max_length=8)
    owners = peewee.TextField()
    drive = peewee.TextField()
    fees = peewee.TextField()
    grade = peewee.CharField(max_length=1)
    type_auc = peewee.CharField(max_length=3)
    # Source URL of the detail page; used at startup to build list_links for
    # duplicate detection.
    link = peewee.TextField()
    folder = peewee.CharField(max_length=20)

    class Meta:
        database = db
  51.  
# Create the table on first run; on subsequent runs peewee raises
# InternalError because the table already exists, which we deliberately
# ignore (best-effort "CREATE TABLE IF NOT EXISTS").
try:
    SpiderBase.create_table()
except peewee.InternalError:
    pass

# Snapshot of every detail-page URL already stored in the DB, used by
# AutoSpider.parse_catalog to skip lots scraped in previous runs.
# NOTE(review): loaded once at import time — rows inserted by other
# processes while this spider runs are not seen.
list_links = [l.link for l in SpiderBase.select()]
  58.  
  59.  
  60. # Spider Scrapy
# Spider Scrapy
class AutoSpider(Spider):
    """Scrapy spider that walks auction catalog pages, extracts one row of
    lot details per car page, saves it through SpiderBase, and (when the
    disabled block is re-enabled) downloads and watermarks the car photos.

    Pipeline: parse (auction landing page) -> parse_catalog (car links,
    de-duplicated against list_links) -> parse_page (field extraction + DB
    insert).

    NOTE(review): parse_page stores state on self (self.sel, self.dic,
    self.helv_text), so the helper methods are order-dependent: year() must
    be called before make()/model() because it initialises self.helv_text,
    and folder()/images_loading() read self.dic. Do not reorder calls.
    """
    name = 'auto'
    # Placeholder domain/URL left over from anonymising the paste.
    allowed_domains = ["site.com"]
    start_urls = [
        "site.com"
    ]

    def parse(self, response):
        """Parse the landing page: read the item count displayed on the page
        and rewrite each catalog link into a '<base>/<count>/<count>/' URL
        (apparently a show-everything-on-one-page pagination trick) before
        scheduling it for parse_catalog."""
        sel = Selector(response)
        # Count of elements shown on the page (joined text of the <strong>).
        elements = ' '.join(sel.xpath('//*[@id="col-01"]/div[1]/ul[1]/li[4]/div[2]/strong/text()').extract())
        links = sel.xpath('//*[@id="col-01"]/div/div/ul/li/a/@href').extract()
        for link in links:
            # Drop the last path segment, append '/<count>/<count>/'.
            link = '/'.join(link.split('/')[:-1]) + '/{0}/{0}'.format(str(elements)) + '/'
            yield Request(url=link, callback=self.parse_catalog)

    def parse_catalog(self, response):
        """Parse a catalog page: schedule each car detail link for
        parse_page unless its URL is already in the database snapshot."""
        sel = Selector(response)
        links_auto = sel.xpath('//div[@class="car-detail-list"]/a/@href').extract()
        if bool(links_auto):
            for link_a in links_auto:
                if link_a not in list_links:
                    print 'Link {0} sent for processing'.format(link_a)
                    yield Request(url=link_a, callback=self.parse_page)
                else:
                    print 'Link {0} there are in table'.format(link_a)

    def parse_page(self, response):
        """Extract all lot fields from a car detail page into self.dic and
        persist them as a SpiderBase row.

        The XPath expressions are tied to the site's exact markup; the
        helper-call order below matters (see class docstring).
        """
        self.sel = Selector(response)
        dic = {}
        dic['date_of_auction'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[1]/descendant::text()').extract())
        dic['auction_type'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[3]/h4/descendant::text()').extract())
        dic['location'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[3]/p[1]/text()').extract())
        # Time captured as e.g. '12.30' (any separator char matches the '.').
        dic['auction_time'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[3]/p[2]/text()').re('(\d{1,2}.\d{2})'))
        dic['lot_no'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/h3/span/text()').re('\d+'))
        # year() must run first — it caches self.helv_text for make()/model().
        dic['year'] = ' '.join(self.year())
        dic['make'] = ' '.join(self.make())
        dic['model'] = ' '.join(self.model())
        # Spec table lookups; keys must match the site's <dt> text exactly,
        # including the double spaces in the 'Reg.' keys.
        dic['body'] = self.table('Body :')
        dic['km'] = self.table('KM. :')
        dic['engine'] = self.table('Engine :')
        # NOTE(review): 'colour' is extracted but has no SpiderBase column,
        # so it is never saved.
        dic['colour'] = self.table('Colour :')
        dic['reg'] = self.table('Reg. :')
        dic['registration_province'] = self.table('Reg.  Province :')
        dic['registration_expiry'] = self.table('Reg.  Expiry :')
        dic['owners'] = self.table('Owners :')
        dic['drive'] = self.table('Drive :')
        dic['fees'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/p[2]/text()').re('\w+'))
        # Grade letter is encoded in a CSS class suffix, e.g. 'grade-a' -> 'A'.
        dic['grade'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/div[2]/@class').re('-([a-z])')).upper()
        # Links to the RTR sheet and inspection sheet images (may be empty).
        dic['rtr'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/div[2]/div/a[1]/@href').extract())
        dic['inspect'] = ' '.join(self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/div[2]/div/a[2]/@href').extract())
        # Auction type taken from the title of whichever icon is '-active'.
        dic['type_auc'] = ''.join(self.sel.xpath('//*[@id="col-01"]/div/ul[1]/li[4]/div[1]/figure/img[contains(@src,'
                                                 ' "-active")]/@title').re('[A-Z]+'))
        self.dic = dic
        # Image download deliberately disabled (string literal, not executed).
        """
       self.images_loading()
       print 'Images from {0} uploaded in folder {1}'.format(response.url, self.folder())
       """

        col = SpiderBase(
            date_of_auction=dic['date_of_auction'],
            auction_type=dic['auction_type'],
            location=dic['location'],
            auction_time=dic['auction_time'],
            lot_no=dic['lot_no'],
            year=dic['year'],
            make=dic['make'],
            model=dic['model'],
            body=dic['body'],
            km=dic['km'],
            engine=dic['engine'],
            reg=dic['reg'],
            registration_province=dic['registration_province'],
            registration_expiry=dic['registration_expiry'],
            owners=dic['owners'],
            drive=dic['drive'],
            fees=dic['fees'],
            grade=dic['grade'],
            type_auc=dic['type_auc'],
            link=response.url,
            folder=self.folder()
        )
        col.save()
        print 'Link {0} added to database'.format(response.url)

    def lot(self):
        """Return the digit groups of the lot number from the page header.
        NOTE(review): never called — parse_page extracts lot_no directly."""
        return self.sel.xpath('//*[@class="helv"]/span/text()')[0].re('\d+')

    def year(self):
        """Return the 4-digit year(s) from the '.helv' header text.
        Side effect: caches the header selector as self.helv_text, which
        make() and model() depend on — must be called before them."""
        self.helv_text = self.sel.xpath('//*[@class="helv"]/text()')
        return self.helv_text.re('\d{4}')

    def make(self):
        """Return the make: the word before ' :' in the cached header text.
        Requires year() to have been called first (sets self.helv_text)."""
        return self.helv_text.re('([a-zA-Z0-9]+)\s\:')

    def model(self):
        """Return the model: the text after ': ' in the cached header text.
        Requires year() to have been called first (sets self.helv_text)."""
        return self.helv_text.re(':\s+([a-zA-Z0-9 .-]+)')

    def table(self, key):
        """Look up one value from the page's <dl> spec table.

        Builds a dict mapping <dt> label -> stripped <dd> value and returns
        the entry for `key`, or '-' when the label is absent. Note only the
        values are stripped — `key` must match the raw <dt> text exactly.
        """
        column1 = self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/dl/dt/text()').extract()
        column2 = self.sel.xpath('//*[@id="col-01"]/div/div/div[2]/dl/dd/text()').extract()
        column2 = list(map(lambda x: x.strip(), column2))
        table_dict = dict(zip(column1, column2))
        return table_dict.get(key, '-')

    def folder(self):
        """Build the per-lot folder name: <type_auc><DDMMYYYY><lot_no>.
        Assumes dic['date_of_auction'] parses as e.g. 'Feb 15 2014'
        ('%b %d %Y') — raises ValueError otherwise; TODO confirm the site's
        date format always matches."""
        dt = datetime.strptime(self.dic['date_of_auction'], '%b %d %Y')
        dt2 = datetime.strftime(dt, '%d%m%Y')
        return '{0}{1}{2}'.format(self.dic['type_auc'], str(dt2), str(self.dic['lot_no']))

    def images_loading(self):
        """Download the lot's gallery images (rewritten to 800x600), stamp
        each with the tiled watermark, and fetch the RTR/inspection sheets.

        Skips the gallery when the lot folder already exists (after
        refresh_folder has had a chance to delete incomplete folders).
        """
        if not os.path.exists(path[:-1]):
            os.makedirs(path[:-1])
        watermark = Image.open(path + 'watermark.png')
        name = self.folder()
        images = self.sel.xpath('//ul[@class="car-gallery"]/li/a/@href').extract()
        # Drop the folder first if a previous run left it incomplete.
        self.refresh_folder(len(images))
        if not os.path.exists(path + name) and bool(images):
            os.makedirs(path + name)
            # NOTE(review): re-extracts the same links already held in
            # `images` above — harmless duplication.
            images = self.sel.xpath('//ul[@class="car-gallery"]/li/a/@href').extract()
            # Force full-size variants via the URL's width/height params.
            images = list(map(lambda x: re.sub('height=([0-9]+)', 'height=600', x), images))
            images = list(map(lambda x: re.sub('width=([0-9]+)', 'width=800', x), images))
            for num, image in enumerate(images, 1):
                # Download to '<n>w.jpg', watermark into '<n>.jpg', delete temp.
                urlretrieve(image, path + name + '/' + str(num) + 'w.jpg')
                im2 = Image.open(path + name + '/' + str(num) + 'w.jpg')
                self.add_watermark(im2, watermark).save(path + name + '/' + str(num) + '.jpg')
                os.remove(path + name + '/' + str(num) + 'w.jpg')
        # RTR and inspection sheets are saved un-watermarked, once each.
        if bool(self.dic['rtr']):
            if not os.path.exists(path + name + '/' + name + 'R.jpg'):
                urlretrieve(self.dic['rtr'], path + name + '/' + name + 'R.jpg')
        if bool(self.dic['inspect']):
            if not os.path.exists(path + name + '/' + name + 'I.jpg'):
                urlretrieve(self.dic['inspect'], path + name + '/' + name + 'I.jpg')

    def refresh_folder(self, count):
        """Delete the lot's image folder when it holds fewer .jpg files than
        expected (`count` gallery images plus one per available RTR /
        inspection sheet), forcing a re-download. A missing folder
        (OSError from listdir) is fine — there is nothing to refresh."""
        name = self.folder()
        try:
            image_folder = [files for files in os.listdir(path + name) if files.endswith('.jpg')]
            if bool(self.dic['inspect']):
                count += 1
            if bool(self.dic['rtr']):
                count += 1
            if len(image_folder) < count:
                rmtree(path + name)
        except OSError:
            pass

    def add_watermark(self, image, watermark, opacity=1, wm_interval=0):
        """Tile `watermark` across all of `image` and return the composite.

        opacity < 1 fades the watermark via its alpha channel (assumes the
        watermark has one after RGBA conversion); wm_interval adds spacing
        between tiles. NOTE(review): `assert` disappears under `python -O` —
        an explicit ValueError would be safer for input validation.
        """
        assert opacity >= 0 and opacity <= 1
        if opacity < 1:
            if watermark.mode != 'RGBA':
                watermark = watermark.convert('RGBA')
            else:
                watermark = watermark.copy()
            alpha = watermark.split()[3]
            alpha = ImageEnhance.Brightness(alpha).enhance(opacity)
            watermark.putalpha(alpha)
        # Paste the watermark on a transparent layer at every tile position,
        # then composite the layer over the photo using its own alpha.
        layer = Image.new('RGBA', image.size, (0,0,0,0))
        for y in range(0, image.size[1], watermark.size[1]+wm_interval):
            for x in range(0, image.size[0], watermark.size[0]+wm_interval):
                layer.paste(watermark, (x, y))
        return Image.composite(layer,  image,  layer)
  222.  
  223. if __name__ == '__main__':
  224.     options = {
  225.         'CONCURRENT_ITEMS': 250,
  226.         'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
  227.         'CONCURRENT_REQUESTS': 30,
  228.         'DOWNLOAD_DELAY': 0.5
  229.     }
  230.  
  231.     spider = AutoSpider()
  232.     crawler = Crawler(Settings())
  233.     crawler.configure()
  234.     crawler.crawl(spider)
  235.     crawler.start()
  236.     reactor.run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement