Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv
- from scrapy.contrib.spiders import CrawlSpider
- from scrapy.selector import Selector
- from scrapy.contrib.linkextractors import LinkExtractor
- from scrapy.http import FormRequest, Request
- from etsbot.items import TransactionItem
- from etsbot.middlewares import RandomProxy
class EuetsbotdetSpider(CrawlSpider):
    """Spider that collects transaction details from the EU ETS site.

    The crawl is a chain of callbacks:

    1. ``parse`` submits the transaction search form once per ID in ``tids``.
    2. ``parseLinks`` follows every ``singleTransaction.do`` link on the
       search-result page.
    3. ``parseDetail`` reads ``transactionID`` / ``transactionDate`` and
       follows the links found in the 6th cell of the rows of the
       ``tblTransactionBlocksInformation`` table.
    4. ``parseAccounttr`` stores ``identifierInReg`` as ``tra_id`` and
       follows the links found in the 7th cell of that table.
    5. ``parseAccountac`` stores ``identifierInReg`` as ``acq_id`` and emits
       the finished item.

    The partially filled item travels between steps 3-5 in ``Request.meta``.
    """

    name = 'euetsbotdet'
    allowed_domains = ['ec.europa.eu']
    start_urls = [
        'http://ec.europa.eu/environment/ets/transaction.do'
    ]

    # Transaction IDs to search for. Hard-coded for now.
    # TODO: load them from the 'transactionID' column of the items CSV
    # (e.g. with csv.DictReader) instead of listing them here.
    tids = ['DE101096', 'AT231']

    def parse(self, response):
        """Submit the search form once for every transaction ID in ``tids``.

        Yields one ``FormRequest`` per ID; each result page is handled by
        ``parseLinks``.
        """
        # Yield rather than return: a ``return`` inside the loop ends the
        # callback after the first ID, so the remaining IDs are never queried.
        for transaction_id in self.tids:
            yield FormRequest.from_response(
                response,
                formname='transactions_maxlength',
                formdata={'transactionID': transaction_id},
                clickdata={'name': 'search'},
                callback=self.parseLinks,
            )

    def parseLinks(self, response):
        """Follow each single-transaction link found on a result page."""
        detail_links = LinkExtractor(
            allow=('http://ec.europa.eu/environment/ets/singleTransaction.do',),
            unique=True,
        )
        for link in detail_links.extract_links(response):
            yield Request(link.url, callback=self.parseDetail)

    def parseDetail(self, response):
        """Start an item from a transaction page and follow its account links.

        Reads the ``transactionID`` and ``transactionDate`` input values, then
        requests every link in the 6th cell of the transaction-blocks table,
        passing the item along in ``meta`` to ``parseAccounttr``.
        """
        sel = Selector(response)
        item = TransactionItem()
        item['transactionID'] = sel.xpath(
            '//table/tr/td/input[@name="transactionID"]/@value').extract()
        item['transactionDate'] = sel.xpath(
            '//table/tr/td/input[@name="transactionDate"]/@value').extract()

        account_links = LinkExtractor(
            unique=True,
            restrict_xpaths='//*[@id="tblTransactionBlocksInformation"]/tr/td[6]/a[@class="resultlink"]',
        )
        for link in account_links.extract_links(response):
            yield Request(
                link.url,
                # Each request gets its own copy so that callbacks running
                # for different links cannot overwrite each other's fields.
                meta={'item': item.copy()},
                callback=self.parseAccounttr,
                # The same account URL can be linked from several
                # transactions; without this the duplicate filter would drop
                # the request and the item it carries.
                dont_filter=True,
            )

    def parseAccounttr(self, response):
        """Record ``tra_id`` from an account page and follow the next links.

        Takes the item from ``response.meta``, stores the
        ``identifierInReg`` input value as ``tra_id``, then requests every
        link in the 7th cell of the transaction-blocks table, handing the
        item on to ``parseAccountac``.
        """
        sel = Selector(response)
        item = response.meta['item']
        item['tra_id'] = sel.xpath(
            '//*[@id="tblAccountInfoReadonly"]/tr/td/input[@name="identifierInReg"]/@value').extract()

        # NOTE(review): this looks for the "tblTransactionBlocksInformation"
        # table in the account-page response, the same table id parseDetail
        # reads from the transaction page. Confirm the account page really
        # contains it; if it does not, no request is made here and no item
        # is ever emitted.
        next_links = LinkExtractor(
            unique=True,
            restrict_xpaths='//*[@id="tblTransactionBlocksInformation"]/tr/td[7]/a[@class="resultlink"]',
        )
        for link in next_links.extract_links(response):
            yield Request(
                link.url,
                # Separate copy per request; see parseDetail.
                meta={'item': item.copy()},
                callback=self.parseAccountac,
                # Keep requests that carry an item even if the URL was
                # already visited for another transaction.
                dont_filter=True,
            )

    def parseAccountac(self, response):
        """Record ``acq_id`` from an account page and emit the finished item."""
        sel = Selector(response)
        item = response.meta['item']
        item['acq_id'] = sel.xpath(
            '//*[@id="tblAccountInfoReadonly"]/tr/td/input[@name="identifierInReg"]/@value').extract()
        yield item
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement