Advertisement
kamegami

mh4armorcrawler.py

Jan 20th, 2015
218
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.40 KB | None | 0 0
  1. import scrapy
  2. from scrapy.contrib.spiders import CrawlSpider, Rule
  3. from scrapy.http import Request
  4. from scrapy.contrib.linkextractors import LinkExtractor
  5. from scrapy.selector import Selector
  6. from selenium import webdriver
  7. import time
  8.  
  9.  
class ArmorPiece(scrapy.Item):
    """One armor piece scraped from a wiki.mh4g.org armor-series page.

    Every field is filled by ``MH4GArmorSpider.parse_armor``; values are
    stored as the (whitespace-normalised) text or raw HTML taken from the
    page, not converted to numbers.
    """

    # --- identity -------------------------------------------------------
    # URL of the armor-series page the piece was read from.
    url = scrapy.Field()
    # Page URL with the 'http://wiki.mh4g.org/ida/' prefix and '.html' removed.
    id = scrapy.Field()
    # Series heading from the translated page, with ' Info' removed.
    series = scrapy.Field()
    # Series heading from the original (Japanese) page.
    jpn_series = scrapy.Field()
    # 'Melee', 'Gunner', 'Both', or '' when the marker is not recognised.
    type = scrapy.Field()
    # 'Head', 'Body', 'Arm', 'Waist', 'Leg', or '' when not recognised.
    part = scrapy.Field()
    # Piece name taken from the translated page.
    name = scrapy.Field()
    # Piece name taken from the original page.
    jpn_name = scrapy.Field()
    # 'Unisex', 'Male', 'Female', or '' when not recognised.
    gender = scrapy.Field()
    rarity = scrapy.Field()

    # --- stats (cell text of the info table) ------------------------------
    fire = scrapy.Field()
    water = scrapy.Field()
    ice = scrapy.Field()
    thunder = scrapy.Field()
    dragon = scrapy.Field()
    defense = scrapy.Field()
    # Cell following 'defense' in the info table (presumably the fully
    # upgraded defense value -- confirm against the site).
    maxed = scrapy.Field()

    # --- slots and skills ---------------------------------------------------
    # Slot count as text, derived from the circle/dash slot marker.
    slots = scrapy.Field()
    # Raw HTML of the whole skills cell.
    skills = scrapy.Field()
    # For each of up to five skills: the id taken from the n-th link's href
    # ('/ida/' and '.html' removed) and the text of the n-th <span>.
    # Separate per-skill name fields (skill_1 .. skill_5) are not stored.
    skill_id_1 = scrapy.Field()
    points_1 = scrapy.Field()
    skill_id_2 = scrapy.Field()
    points_2 = scrapy.Field()
    skill_id_3 = scrapy.Field()
    points_3 = scrapy.Field()
    skill_id_4 = scrapy.Field()
    points_4 = scrapy.Field()
    skill_id_5 = scrapy.Field()
    points_5 = scrapy.Field()

    # --- crafting -------------------------------------------------------------
    # Raw HTML of the whole creation-materials cell.
    create = scrapy.Field()
    # For each of up to four materials: the id taken from the n-th link's
    # href and the quantity text with the 'x' removed.
    # Separate per-material name fields (material_1 .. material_4) are not
    # stored.
    material_id_1 = scrapy.Field()
    count_1 = scrapy.Field()
    material_id_2 = scrapy.Field()
    count_2 = scrapy.Field()
    material_id_3 = scrapy.Field()
    count_3 = scrapy.Field()
    material_id_4 = scrapy.Field()
    count_4 = scrapy.Field()
  58.  
  59. #scrapy.Spider
  60. #class MH4GSpider(CrawlSpider):
  61. class MH4GArmorSpider(CrawlSpider):
  62.     name = "crawl_mh4garmor"
  63.     allowed_domains = ['wiki.mh4g.org']
  64.     #allowed_domains = ['translate.aegil.net/']
  65.     start_urls = ['http://wiki.mh4g.org/data/1445.html']
  66.     #start_urls = ['http://wiki.mh4g.org/ida/132649.html']
  67.     #start_urls = ['http://wiki.mh4g.org/ida/73595.html']
  68.     #start_urls = ['http://wiki.mh4g.org/data/1445.html']
  69.     #start_urls = ['http://translate.aegil.net/index.html?l=http://wiki.mh4g.org/data/1512.html']
  70.  
  71.     def __init__(self):
  72.         CrawlSpider.__init__(self)
  73.         self.br = webdriver.Firefox()
  74.  
  75.     def slot(self, item):
  76.         item = u' '.join(unicode(u' '.join(item)).split())
  77.         if item == u'\u25cb \u25cb -':
  78.             return u'2'
  79.         elif item == u'\u25cb - -':
  80.             return u'1'
  81.         elif item == u'- - -':
  82.             return u'0'
  83.         elif item == u'\u25cb \u25cb \u25cb':
  84.             return u'3'
  85.  
  86.     def first(self, item):
  87.         if item:
  88.             return u' '.join(unicode(u' '.join(item)).split())
  89.         else:
  90.             return u''
  91.  
  92.     def gender(self, item):
  93.         item = u' '.join(unicode(u' '.join(item)).split())
  94.         if item == u'\u7537\u5973\u5171\u7528':
  95.             return u'Unisex'
  96.         elif item == u'\u7537\u6027\u5c02\u7528':
  97.             return u'Male'
  98.         elif item == u'\u5973\u6027\u5c02\u7528':
  99.             return u'Female'
  100.         else:
  101.             return u''
  102.  
  103.     def series(self, item):
  104.         item = u' '.join(unicode(u' '.join(item)).split())
  105.         return item.replace(u'\u306e\u60c5\u5831', u'')
  106.  
  107.     def eng_series(self, item):
  108.         item = u' '.join(unicode(u' '.join(item)).split())
  109.         return item.replace(u' Info', u'')
  110.  
  111.     def count(self, item):
  112.         item = u' '.join(unicode(u' '.join(item)).split())
  113.         return item.replace(u'x', u'')
  114.  
  115.     def part(self, item):
  116.         item = u' '.join(unicode(u' '.join(item)).split())
  117.         if item == u'\u982d':
  118.             return u'Head'
  119.         elif item == u'\u80f4':
  120.             return u'Body'
  121.         elif item == u'\u8155':
  122.             return u'Arm'
  123.         elif item == u'\u8170':
  124.             return u'Waist'
  125.         elif item == u'\u811a':
  126.             return u'Leg'
  127.         else:
  128.             return ''
  129.  
  130.     def armortype(self, item):
  131.         item = u' '.join(unicode(u' '.join(item)).split())
  132.         if item == u'[\u5263\u58eb\u7528]':
  133.             return u'Melee'
  134.         elif item == u'[\u30ac\u30f3\u30ca\u30fc\u7528]':
  135.             return u'Gunner'
  136.         elif item == u'[\u5171\u7528]':
  137.             return u'Both'
  138.         else:
  139.             return u''
  140.  
  141.     def parse(self, response):
  142.         links = response.xpath('//table//td//a//@href').extract()
  143.         #del links[5:]
  144.         for m in links:
  145.             murl = 'http://wiki.mh4g.org' + m
  146.             yield Request(url = murl, callback = self.parse_armor)
  147.  
  148.     def parse_armor(self, response):
  149.         items = []
  150.         skill_count = 2
  151.         self.br.get('http://translate.aegil.net/index.html?l=' + response.url)
  152.         if('Some pages can take a while' in self.br.page_source):
  153.             time.sleep(3)
  154.         hxs = Selector(text=self.br.page_source)
  155.         skill_count = 2
  156.         for count in range(2,12):
  157.             type_row = False
  158.             dash = self.first(response.xpath('(((//table)[1]//table)[1]//tr)['+str(skill_count)+']/td[2]/text()').extract())
  159.             if count == 2:
  160.                 type_row = True
  161.             if dash == u'':
  162.                 skill_count += 1
  163.                 type_row = True
  164.                 dash = self.first(response.xpath('(((//table)[1]//table)[1]//tr)['+str(skill_count)+']/td[2]/text()').extract())
  165.             if dash != u'':
  166.                 items.append({'type_row': type_row,
  167.                               'info': response.xpath('(((//table)[1]//table)[1]//tr)['+str(skill_count)+']'),
  168.                               'eng_info': hxs.xpath('(((//table)[2]//table)[1]//tr)['+str(skill_count)+']'),
  169.                               'skills': response.xpath('(((//table)[1]//table)[3]//tr)['+str(skill_count)+']'),
  170.                               'create': response.xpath('(((//table)[1]//table)[4]//tr)['+str(count)+']')})
  171.             skill_count += 1
  172.  
  173.         jpn_series = self.series(response.xpath(
  174.             '//*[@id="data2"]/h3[1]/text()').extract())
  175.         series = self.eng_series(hxs.xpath(
  176.             '//*[@id="data2"]/h3[1]/text()').extract())
  177.         count = 1
  178.         for item in items:
  179.             piece = ArmorPiece()
  180.             piece['url'] = response.url
  181.             if item['type_row']:
  182.                 ptype = self.armortype(item['info'].xpath('td[1]/text()').extract())
  183.                 piece['type'] = ptype
  184.                 piece['part'] = self.part(item['info'].xpath('td[2]//text()').extract())
  185.                 piece['name'] = self.first(item['eng_info'].xpath('td[3]//text()').extract())
  186.                 piece['jpn_name'] = self.first(item['info'].xpath('td[3]//text()').extract())
  187.                 piece['gender'] = self.gender(item['info'].xpath('td[4]//text()').extract())
  188.                 piece['rarity'] = self.first(item['info'].xpath('td[5]//text()').extract())
  189.                 piece['fire'] = self.first(item['info'].xpath('td[6]/span/span/text()').extract())
  190.                 piece['water'] = self.first(item['info'].xpath('td[7]/span/span/text()').extract())
  191.                 piece['ice'] = self.first(item['info'].xpath('td[8]/span/span/text()').extract())
  192.                 piece['thunder'] = self.first(item['info'].xpath('td[9]/span/span/text()').extract())
  193.                 piece['dragon'] = self.first(item['info'].xpath('td[10]/span/span/text()').extract())
  194.                 piece['defense'] = self.first(item['info'].xpath('td[11]/text()').extract())
  195.                 piece['maxed'] = self.first(item['info'].xpath('td[12]/text()').extract())
  196.             else:
  197.                 piece['type'] = ptype
  198.                 piece['part'] = self.part(item['info'].xpath('td[1]//text()').extract())
  199.                 piece['name'] = self.first(item['eng_info'].xpath('td[2]//text()').extract())
  200.                 piece['jpn_name'] = self.first(item['info'].xpath('td[2]//text()').extract())
  201.                 piece['gender'] = self.gender(item['info'].xpath('td[3]//text()').extract())
  202.                 piece['rarity'] = self.first(item['info'].xpath('td[4]//text()').extract())
  203.                 piece['fire'] = self.first(item['info'].xpath('td[5]/span/span/text()').extract())
  204.                 piece['water'] = self.first(item['info'].xpath('td[6]/span/span/text()').extract())
  205.                 piece['ice'] = self.first(item['info'].xpath('td[7]/span/span/text()').extract())
  206.                 piece['thunder'] = self.first(item['info'].xpath('td[8]/span/span/text()').extract())
  207.                 piece['dragon'] = self.first(item['info'].xpath('td[9]/span/span/text()').extract())
  208.                 piece['defense'] = self.first(item['info'].xpath('td[10]/text()').extract())
  209.                 piece['maxed'] = self.first(item['info'].xpath('td[11]/text()').extract())
  210.             piece['slots'] = self.slot(item['skills'].xpath('td[2]/text()').extract())
  211.             piece['id'] = response.url.replace('http://wiki.mh4g.org/ida/', '').replace('.html', '')
  212.             piece['jpn_series'] = jpn_series
  213.             piece['series'] = series
  214.             piece['skills'] = self.first(item['skills'].xpath('td[3]').extract())
  215.             piece['skill_id_1'] = self.first(item['skills'].xpath('td[3]/a[1]/@href').extract()).replace('/ida/', '').replace('.html', '')
  216.             piece['points_1'] = self.first(item['skills'].xpath('td[3]/span[1]/text()').extract())
  217.             piece['skill_id_2'] = self.first(item['skills'].xpath('td[3]/a[2]/@href').extract()).replace('/ida/', '').replace('.html', '')
  218.             piece['points_2'] = self.first(item['skills'].xpath('td[3]/span[2]/text()').extract())
  219.             piece['skill_id_3'] = self.first(item['skills'].xpath('td[3]/a[3]/@href').extract()).replace('/ida/', '').replace('.html', '')
  220.             piece['points_3'] = self.first(item['skills'].xpath('td[3]/span[3]/text()').extract())
  221.             piece['skill_id_4'] = self.first(item['skills'].xpath('td[3]/a[4]/@href').extract()).replace('/ida/', '').replace('.html', '')
  222.             piece['points_4'] = self.first(item['skills'].xpath('td[3]/span[4]/text()').extract())
  223.             piece['skill_id_5'] = self.first(item['skills'].xpath('td[3]/a[5]/@href').extract()).replace('/ida/', '').replace('.html', '')
  224.             piece['points_5'] = self.first(item['skills'].xpath('td[3]/span[5]/text()').extract())
  225.             piece['create'] = self.first(item['create'].xpath('td[2]').extract())
  226.             piece['material_id_1'] = self.first(item['create'].xpath('td[2]/a[1]/@href').extract()).replace('/ida/', '').replace('.html', '')
  227.             piece['count_1'] = self.count(item['create'].xpath('(td[2]//text()[normalize-space()])[2]').extract())
  228.             piece['material_id_2'] = self.first(item['create'].xpath('td[2]/a[2]/@href').extract()).replace('/ida/', '').replace('.html', '')
  229.             piece['count_2'] = self.count(item['create'].xpath('(td[2]//text()[normalize-space()])[4]').extract())
  230.             piece['material_id_3'] = self.first(item['create'].xpath('td[2]/a[3]/@href').extract()).replace('/ida/', '').replace('.html', '')
  231.             piece['count_3'] = self.count(item['create'].xpath('(td[2]//text()[normalize-space()])[6]').extract())
  232.             piece['material_id_4'] = self.first(item['create'].xpath('td[2]/a[4]/@href').extract()).replace('/ida/', '').replace('.html', '')
  233.             piece['count_4'] = self.count(item['create'].xpath('(td[2]//text()[normalize-space()])[8]').extract())
  234.             count = count + 1
  235.             yield piece
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement