Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from scrapy import Request
- from scrapy.spiders import CrawlSpider, Rule
- from scrapy.linkextractors import LinkExtractor
- from etender.items import EtenderItem
class EtenderSpider(CrawlSpider):
    """Crawl EU eTendering search results and scrape each tender detail page.

    The single CrawlSpider rule follows the pagination link on each results
    page and hands the page to ``parse_pages``, which in turn queues every
    tender detail link for ``parse_tender_item``.
    """

    name = "etender"

    # Pre-built search: PUBLISHED tenders with a start date between
    # 01/05/2017 and 19/11/2018.
    start_urls = [
        'https://etendering.ted.europa.eu/cft/cft-search.html?'
        'caList=&_caList=1&status=PUBLISHED&startDateFrom=01%2F05%2F2017&startDateTo=19%2F11%2F2018&'
        'closingDateFrom=&closingDateTo=&procedureType=&_procedureType=1&confirm=Search#'
    ]

    # The second-to-last <a> in the pager span is assumed to be the
    # "next page" link — NOTE(review): verify against the live markup.
    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                restrict_xpaths=('//*[@id="centerContent"]/div/span[3]/a[last()-1]',),
            ),
            callback="parse_pages",
            follow=True,
        ),
    )

    def parse_tender_item(self, response):
        """Scrape one tender detail page into an ``EtenderItem``.

        Any field whose element is missing ends up as ``None`` (``.get()``
        returns ``None`` on an empty selection).
        """
        self.logger.info('Processing tender: %s', response.url)
        tender = EtenderItem()
        tender['url'] = response.url
        tender['name'] = response.xpath('//*[@id="cft_titleautolinked"]/text()').get()
        tender['reference'] = response.xpath('//*[@id="cft.data.internal_name"]/text()').get()
        tender['description'] = response.xpath('//*[@id="cft_descriptionautolinked"]/text()').get()
        tender['procedureType'] = response.xpath('//*[@id="cft.data.procedure_type"]/text()').get()
        tender['status'] = response.xpath('//*[@id="cft.data.status"]/text()').get()
        tender['contractType'] = response.xpath('//*[@id="cft.data.contract_type"]/text()').get()
        yield tender

    def parse_pages(self, response):
        """Queue a request for every tender linked on a results page."""
        self.logger.info('Processing page: %s', response.url)
        tender_links = response.css('tbody tr td:nth-child(2) a::attr(href)').getall()
        for href in tender_links:
            # urljoin resolves relative hrefs against the page URL; passing a
            # relative URL straight to Request() raises ValueError.
            yield Request(response.urljoin(href), callback=self.parse_tender_item)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement