Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import time

try:  # Python 2
    from urlparse import urljoin
except ImportError:  # Python 3
    from urllib.parse import urljoin

import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.selector import Selector
from selenium import webdriver
class ArmorPiece(scrapy.Item):
    """Scrapy item holding one scraped armor piece.

    Every field is a plain ``scrapy.Field()``; the spider in this file
    fills them with (unicode) strings.  Skill and material data is stored
    in numbered columns: up to five skills and up to four materials.
    """

    # Identification.
    url = scrapy.Field()
    id = scrapy.Field()
    series = scrapy.Field()
    jpn_series = scrapy.Field()
    type = scrapy.Field()
    part = scrapy.Field()
    name = scrapy.Field()
    jpn_name = scrapy.Field()
    gender = scrapy.Field()
    rarity = scrapy.Field()

    # Elemental values and defense.
    fire = scrapy.Field()
    water = scrapy.Field()
    ice = scrapy.Field()
    thunder = scrapy.Field()
    dragon = scrapy.Field()
    defense = scrapy.Field()
    maxed = scrapy.Field()

    # Slots and skills: ``skills`` keeps the raw cell markup, the numbered
    # fields keep the per-skill id and point value.
    slots = scrapy.Field()
    skills = scrapy.Field()
    skill_id_1 = scrapy.Field()
    points_1 = scrapy.Field()
    skill_id_2 = scrapy.Field()
    points_2 = scrapy.Field()
    skill_id_3 = scrapy.Field()
    points_3 = scrapy.Field()
    skill_id_4 = scrapy.Field()
    points_4 = scrapy.Field()
    skill_id_5 = scrapy.Field()
    points_5 = scrapy.Field()

    # Crafting recipe: ``create`` keeps the raw cell markup, the numbered
    # fields keep the per-material id and required count.
    create = scrapy.Field()
    material_id_1 = scrapy.Field()
    count_1 = scrapy.Field()
    material_id_2 = scrapy.Field()
    count_2 = scrapy.Field()
    material_id_3 = scrapy.Field()
    count_3 = scrapy.Field()
    material_id_4 = scrapy.Field()
    count_4 = scrapy.Field()
- #scrapy.Spider
- #class MH4GSpider(CrawlSpider):
class MH4GArmorSpider(CrawlSpider):
    """Scrape armor pieces from wiki.mh4g.org into ``ArmorPiece`` items.

    The index page in ``start_urls`` is scanned for table links; each linked
    page is parsed twice: once from the Scrapy response (original text) and
    once through a Selenium-driven browser pointed at a translation proxy,
    from which the English names are taken.

    NOTE(review): the class derives from ``CrawlSpider`` but defines no
    ``rules`` and overrides ``parse``, so it effectively behaves like a
    plain spider -- confirm before adding rules.
    """

    name = "crawl_mh4garmor"
    allowed_domains = ['wiki.mh4g.org']
    start_urls = ['http://wiki.mh4g.org/data/1445.html']

    # Slot cell text (U+25CB WHITE CIRCLE per open slot) -> slot count.
    _SLOTS = {
        u'- - -': u'0',
        u'\u25cb - -': u'1',
        u'\u25cb \u25cb -': u'2',
        u'\u25cb \u25cb \u25cb': u'3',
    }
    # Source-language cell text -> English label.
    _GENDERS = {
        u'\u7537\u5973\u5171\u7528': u'Unisex',
        u'\u7537\u6027\u5c02\u7528': u'Male',
        u'\u5973\u6027\u5c02\u7528': u'Female',
    }
    _PARTS = {
        u'\u982d': u'Head',
        u'\u80f4': u'Body',
        u'\u8155': u'Arm',
        u'\u8170': u'Waist',
        u'\u811a': u'Leg',
    }
    _ARMOR_TYPES = {
        u'[\u5263\u58eb\u7528]': u'Melee',
        u'[\u30ac\u30f3\u30ca\u30fc\u7528]': u'Gunner',
        u'[\u5171\u7528]': u'Both',
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Firefox browser.

        Positional and keyword arguments are forwarded to ``CrawlSpider``
        so spider arguments supplied by Scrapy are accepted.
        """
        CrawlSpider.__init__(self, *args, **kwargs)
        self.br = webdriver.Firefox()

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider.

        Without this the Firefox process started in ``__init__`` is never
        shut down.
        """
        self.br.quit()

    @staticmethod
    def _squash(item):
        """Join text fragments and collapse whitespace runs to one space."""
        return u' '.join(u' '.join(item).split())

    @staticmethod
    def _cell(row, index, tail):
        """Extract ``td[index]`` + ``tail`` (an XPath suffix) from ``row``."""
        return row.xpath('td[%d]%s' % (index, tail)).extract()

    def _ida_id(self, hrefs):
        """Reduce an extracted ``/ida/<id>.html`` href list to ``<id>``."""
        return self.first(hrefs).replace('/ida/', '').replace('.html', '')

    def slot(self, item):
        """Translate slot cell text to a slot count string ('0'..'3').

        Returns ``u''`` for unrecognised text, like the other translators.
        """
        return self._SLOTS.get(self._squash(item), u'')

    def first(self, item):
        """Return the whitespace-normalised text of ``item`` or ``u''``."""
        if not item:
            return u''
        return self._squash(item)

    def gender(self, item):
        """Translate the gender cell to 'Unisex'/'Male'/'Female' or ``u''``."""
        return self._GENDERS.get(self._squash(item), u'')

    def series(self, item):
        """Return the series heading with the trailing info suffix removed."""
        return self._squash(item).replace(u'\u306e\u60c5\u5831', u'')

    def eng_series(self, item):
        """Return the translated series heading without its ' Info' suffix."""
        return self._squash(item).replace(u' Info', u'')

    def count(self, item):
        """Return a material count with the 'x' multiplier sign removed."""
        return self._squash(item).replace(u'x', u'')

    def part(self, item):
        """Translate the body-part cell to an English label or ``u''``."""
        return self._PARTS.get(self._squash(item), u'')

    def armortype(self, item):
        """Translate the armor-type cell to 'Melee'/'Gunner'/'Both' or ``u''``."""
        return self._ARMOR_TYPES.get(self._squash(item), u'')

    def parse(self, response):
        """Request every link found in a table cell of the index page."""
        for href in response.xpath('//table//td//a//@href').extract():
            # urljoin copes with absolute and relative hrefs alike instead
            # of assuming every href is host-relative.
            yield Request(url=urljoin(response.url, href),
                          callback=self.parse_armor)

    def _collect_rows(self, response, hxs):
        """Pair up the table rows that describe each armor piece.

        Walks rows 2..11 of the first nested table.  A row whose second
        cell has no text is skipped and the row after it is treated as one
        carrying an extra leading armor-type cell (as is the very first
        row).  Returns a list of dicts with the keys ``type_row``, ``info``,
        ``eng_info``, ``skills`` and ``create``.
        """
        row_xpath = '(((//table)[%d]//table)[%d]//tr)[%d]'
        rows = []
        skill_count = 2
        for count in range(2, 12):
            type_row = count == 2
            info = response.xpath(row_xpath % (1, 1, skill_count))
            dash = self.first(info.xpath('td[2]/text()').extract())
            if dash == u'':
                # Skip the empty row and retry on the following one.
                skill_count += 1
                type_row = True
                info = response.xpath(row_xpath % (1, 1, skill_count))
                dash = self.first(info.xpath('td[2]/text()').extract())
            if dash != u'':
                rows.append({
                    'type_row': type_row,
                    'info': info,
                    # Same row position in the translated page.
                    'eng_info': hxs.xpath(row_xpath % (2, 1, skill_count)),
                    'skills': response.xpath(row_xpath % (1, 3, skill_count)),
                    # The crafting table is indexed by the loop counter,
                    # not by the skip-adjusted row index.
                    'create': response.xpath(row_xpath % (1, 4, count)),
                })
            skill_count += 1
        return rows

    def parse_armor(self, response):
        """Yield one ``ArmorPiece`` per armor row of an armor-series page.

        The page is additionally loaded through the translation proxy in
        the Selenium browser to obtain the English names.
        """
        self.br.get('http://translate.aegil.net/index.html?l=' + response.url)
        if 'Some pages can take a while' in self.br.page_source:
            # Give the proxy time to finish before reading the page.
            time.sleep(3)
        hxs = Selector(text=self.br.page_source)

        rows = self._collect_rows(response, hxs)
        heading = '//*[@id="data2"]/h3[1]/text()'
        jpn_series = self.series(response.xpath(heading).extract())
        series = self.eng_series(hxs.xpath(heading).extract())
        piece_id = response.url.replace(
            'http://wiki.mh4g.org/ida/', '').replace('.html', '')

        # Armor type is only present on type rows and carries over to the
        # rows below; start empty so a missing type row cannot raise.
        ptype = u''
        for item in rows:
            info = item['info']
            eng_info = item['eng_info']
            skills = item['skills']
            create = item['create']

            # Type rows have one extra leading cell, shifting every column.
            offset = 0
            if item['type_row']:
                ptype = self.armortype(info.xpath('td[1]/text()').extract())
                offset = 1

            piece = ArmorPiece()
            piece['url'] = response.url
            piece['id'] = piece_id
            piece['jpn_series'] = jpn_series
            piece['series'] = series
            piece['type'] = ptype
            piece['part'] = self.part(
                self._cell(info, 1 + offset, '//text()'))
            piece['name'] = self.first(
                self._cell(eng_info, 2 + offset, '//text()'))
            piece['jpn_name'] = self.first(
                self._cell(info, 2 + offset, '//text()'))
            piece['gender'] = self.gender(
                self._cell(info, 3 + offset, '//text()'))
            piece['rarity'] = self.first(
                self._cell(info, 4 + offset, '//text()'))
            elements = ('fire', 'water', 'ice', 'thunder', 'dragon')
            for column, element in enumerate(elements, 5 + offset):
                piece[element] = self.first(
                    self._cell(info, column, '/span/span/text()'))
            piece['defense'] = self.first(
                self._cell(info, 10 + offset, '/text()'))
            piece['maxed'] = self.first(
                self._cell(info, 11 + offset, '/text()'))

            piece['slots'] = self.slot(skills.xpath('td[2]/text()').extract())
            piece['skills'] = self.first(skills.xpath('td[3]').extract())
            for n in range(1, 6):
                piece['skill_id_%d' % n] = self._ida_id(
                    skills.xpath('td[3]/a[%d]/@href' % n).extract())
                piece['points_%d' % n] = self.first(
                    skills.xpath('td[3]/span[%d]/text()' % n).extract())

            piece['create'] = self.first(create.xpath('td[2]').extract())
            for n in range(1, 5):
                piece['material_id_%d' % n] = self._ida_id(
                    create.xpath('td[2]/a[%d]/@href' % n).extract())
                # Counts are the even-numbered non-blank text nodes.
                piece['count_%d' % n] = self.count(create.xpath(
                    '(td[2]//text()[normalize-space()])[%d]' % (2 * n)
                ).extract())

            yield piece
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement