Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scrapy
- from scrapy.contrib.spiders import CrawlSpider, Rule
- from scrapy.http import Request
- from scrapy.contrib.linkextractors import LinkExtractor
- from scrapy.selector import Selector
- from selenium import webdriver
- import time
class Weapon(scrapy.Item):
    """One weapon entry scraped from an MH4U weapon-tree page.

    The fields are populated by ``MH4GWeaponsSpider.parse_weapon`` from the
    values the spider carries over in ``response.meta`` plus the weapon's
    own page.
    """

    # Number of ``img`` children found in the weapon's tree-table cell.
    depth = scrapy.Field()
    # 1-based position of the weapon among the linked rows of its tree page.
    order = scrapy.Field()
    # Text of the weapon's link on the tree page, as extracted by the spider.
    name = scrapy.Field()
    # Text taken from the "linetable" on the weapon page, with '/ ' removed.
    jpn_name = scrapy.Field()
    # Human-readable weapon type of the tree page the weapon was found on.
    weapon_type = scrapy.Field()
- #scrapy.Spider
- #class MH4GSpider(CrawlSpider):
class MH4GWeaponsSpider(CrawlSpider):
    """Crawl the MH4U weapon-tree pages and emit one ``Weapon`` per entry.

    ``parse`` walks the tree table of every page in ``start_urls`` and
    schedules a request for each weapon link it finds. ``parse_weapon`` then
    builds the item from the values carried in ``request.meta`` plus the
    Japanese name read from the weapon's own page.

    NOTE(review): no ``rules`` are defined and ``parse`` is overridden, so
    none of CrawlSpider's link-following appears to be used; a plain
    ``scrapy.Spider`` base would presumably behave the same -- confirm
    before switching.
    """

    name = "crawl_mh4gweaponnames"
    allowed_domains = ['monsterhunter.wikia.com']
    start_urls = [
        'http://monsterhunter.wikia.com/wiki/MH4U:_Great_Sword_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Long_Sword_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Sword_and_Shield_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Dual_Blades_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Hammer_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Hunting_Horn_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Lance_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Gunlance_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Switch_Axe_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Charge_Blade_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Light_Bowgun_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Heavy_Bowgun_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Bow_Weapon_Tree',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Insect_Glaive_Weapon_Tree',
    ]

    # Maps each tree-page URL to the weapon type stored on its items.
    # Defined once on the class instead of being rebuilt on every ``parse``.
    weapon_types_by_url = {
        'http://monsterhunter.wikia.com/wiki/MH4U:_Great_Sword_Weapon_Tree': 'Great Sword',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Long_Sword_Weapon_Tree': 'Long Sword',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Sword_and_Shield_Weapon_Tree': 'Sword and Shield',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Dual_Blades_Weapon_Tree': 'Dual Blades',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Hammer_Weapon_Tree': "Hammer",
        'http://monsterhunter.wikia.com/wiki/MH4U:_Hunting_Horn_Weapon_Tree': 'Hunting Horn',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Lance_Weapon_Tree': 'Lance',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Gunlance_Weapon_Tree': 'Gunlance',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Switch_Axe_Weapon_Tree': "Switch Axe",
        'http://monsterhunter.wikia.com/wiki/MH4U:_Charge_Blade_Weapon_Tree': 'Charge Blade',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Light_Bowgun_Weapon_Tree': 'Light Bowgun',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Heavy_Bowgun_Weapon_Tree': 'Heavy Bowgun',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Bow_Weapon_Tree': 'Bow',
        'http://monsterhunter.wikia.com/wiki/MH4U:_Insect_Glaive_Weapon_Tree': 'Insect Glaive',
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider.

        Positional and keyword arguments are forwarded unchanged to
        ``CrawlSpider.__init__`` so that spider arguments supplied by Scrapy
        are accepted; calling with no arguments works as before.
        """
        CrawlSpider.__init__(self, *args, **kwargs)

    def first(self, item):
        """Join extracted text fragments into one whitespace-normalised string.

        ``item`` is a list of strings such as the result of ``.extract()``.
        Fragments are joined with single spaces and every run of whitespace
        is collapsed to one space. Returns ``u''`` when ``item`` is empty or
        ``None``.
        """
        if not item:
            return u''
        # ``u' '.join`` already yields a unicode string, so no explicit
        # ``unicode()`` conversion is needed (that builtin is Python 2 only).
        return u' '.join(u' '.join(item).split())

    def parse(self, response):
        """Yield one weapon-page request for every linked row of a tree table.

        Each request carries the weapon's link text, its 1-based order among
        the linked rows, its depth and the weapon type of the tree page in
        ``meta`` for ``parse_weapon`` to read. Pages whose URL is not in
        ``weapon_types_by_url`` are logged and skipped.
        """
        # After a redirect ``response.url`` is the final URL; the redirect
        # middleware keeps the originally requested one in ``redirect_urls``.
        tree_url = (response.meta.get('redirect_urls') or [response.url])[0]
        weapon_type = self.weapon_types_by_url.get(tree_url)
        if weapon_type is None:
            self.log('No weapon type known for tree page %s; skipping' % tree_url)
            return

        rows = response.xpath('//table[@class="wikitable hover"]//tr//td[1]')
        order = 0
        for row in rows:
            hrefs = row.xpath('.//a[2]//@href').extract()
            if not hrefs:
                # Rows without a second link are not weapon entries.
                continue
            order += 1
            yield Request(
                url='http://monsterhunter.wikia.com' + hrefs[0],
                meta={
                    # Hand 302 responses to the callback instead of
                    # following them.
                    'dont_redirect': True,
                    'handle_httpstatus_list': [302],
                    'weapon_name': row.xpath('.//a[2]//text()').extract(),
                    'order': order,
                    # Presumably named ``depth0`` to avoid clashing with
                    # Scrapy's own ``depth`` meta key -- TODO confirm.
                    'depth0': len(row.xpath('img').extract()),
                    'weaponurl': weapon_type,
                },
                callback=self.parse_weapon,
            )

    def parse_weapon(self, response):
        """Build and yield the ``Weapon`` item for one weapon page.

        Depth, order, weapon type and name come from ``response.meta``; the
        Japanese name is read from the page's "linetable" with the '/ '
        separator removed.
        """
        meta = response.meta
        weapon = Weapon()
        weapon["depth"] = meta['depth0']
        weapon["order"] = meta['order']
        weapon["weapon_type"] = meta['weaponurl']
        # NOTE(review): this is the raw list of text nodes extracted on the
        # tree page, not a single string -- confirm consumers expect a list
        # before normalising it with ``first``.
        weapon["name"] = meta['weapon_name']
        jpn_text = self.first(
            response.xpath('//table[@class="linetable"]//div[2]/text()').extract()
        )
        weapon["jpn_name"] = jpn_text.replace('/ ', '')
        yield weapon
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement