Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class exhibitors_spider(scrapy.Spider):
    """Spider that collects the contact persons listed on an exhibitor page.

    A single request is issued for ``url``.  Every contact-person entry found
    on the response contributes three numbered fields (name, job title and
    mail link) to one item whose class is generated at runtime, because the
    number of contact persons is only known once the page has been parsed.
    """

    name = "exhibitors"
    url = "some url"

    def _create_item_class(self, class_name, field_list):
        """Build a ``DictItem`` subclass declaring one ``Field`` per name.

        Args:
            class_name: Name given to the generated item class.
            field_list: Iterable of field names the item must accept.

        Returns:
            A new class derived from ``DictItem`` whose ``fields`` mapping
            contains a ``Field()`` for every entry of ``field_list``.
        """
        fields = {field_name: Field() for field_name in field_list}
        # str() hands type() a native string for the class name.
        return type(str(class_name), (DictItem,), {'fields': fields})

    def start_requests(self):
        """Yield the single request for ``self.url``, handled by ``parse_page``."""
        yield Request(url=self.url, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        """Extract every contact person from ``response`` and yield one item.

        For the i-th contact person (1-based) the item receives the keys
        ``Contact_Person_Name_<i>``, ``Contact_Person_Title_<i>`` and
        ``Contact_Person_Link<i>``.  A value that is absent from the page is
        stored as ``0``.

        Args:
            response: The downloaded page to scrape.

        Yields:
            One dynamically typed item holding all contact-person fields.
        """
        contact_persons = {}
        blocks = response.selector.xpath(
            "//h2[contains(text(),'Contact person')]/..//..//div/.//li"
        )

        # enumerate() replaces the Python-2-only xrange() index loop; an empty
        # selector list simply produces no iterations.
        for i, block in enumerate(blocks, 1):
            cp_name = block.xpath(".//a[@itemprop='name']/bdi/text()").extract_first()
            # capwords() is applied to the text itself: encoding to bytes
            # first breaks on Python 3 and mis-capitalises non-ASCII names.
            contact_persons["Contact_Person_Name_{}".format(i)] = (
                capwords(cp_name) if cp_name else 0
            )

            cp_title = block.xpath(".//div[@itemprop='jobTitle']/text()").extract_first()
            contact_persons["Contact_Person_Title_{}".format(i)] = (
                capwords(cp_title) if cp_title else 0
            )

            cp_link = block.xpath(".//a[@class='ngn-mail-link']/@href").extract_first()
            if cp_link:
                # ``domain`` is not defined on this class.  Honour it when it
                # has been supplied (e.g. by a subclass or spider argument),
                # otherwise resolve the href against the page URL instead of
                # failing with AttributeError.
                domain = getattr(self, 'domain', None)
                cp_link = domain + cp_link if domain else response.urljoin(cp_link)
            else:
                cp_link = 0
            # NOTE(review): unlike the other keys, this one has no underscore
            # before the index; kept unchanged so the output schema is stable.
            contact_persons["Contact_Person_Link{}".format(i)] = cp_link

        # NOTE(review): the item is emitted even when no contact person was
        # found (it is then empty) -- confirm this matches the intended
        # behaviour, the original nesting could not be recovered.
        ExhibitorsItem = self._create_item_class('ExhibitorsItem', contact_persons.keys())
        # The generated class has to be instantiated before it can be filled;
        # previously ``item`` was never created, which raised NameError.
        item = ExhibitorsItem(contact_persons)
        yield item
Add Comment
Please, Sign In to add comment