Advertisement
Guest User

Untitled

a guest
Jan 14th, 2015
300
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.57 KB | None | 0 0
  1. import csv
  2.  
  3. from scrapy.contrib.spiders import CrawlSpider
  4. from scrapy.selector import Selector
  5. from scrapy.contrib.linkextractors import LinkExtractor
  6. from scrapy.http import FormRequest, Request
  7.  
  8. from etsbot.items import TransactionItem
  9. from etsbot.middlewares import RandomProxy
  10.  
  11. class EuetsbotdetSpider(CrawlSpider):
  12. name = 'euetsbotdet'
  13. allowed_domains = ['ec.europa.eu']
  14. start_urls = [
  15. 'http://ec.europa.eu/environment/ets/transaction.do'
  16. ]
  17.  
  18. def parse(self, response):
  19. #self.data = csv.DictReader(open('/home/...t/items.csv','r'))
  20. #self.tids = []
  21. #for self.row in self.data:
  22. # self.tids.append(self.row['transactionID'])
  23. self.tids = ['DE101096','AT231']
  24.  
  25. for self.id in self.tids:
  26. return FormRequest.from_response(
  27. response,
  28. formname='transactions_maxlength',
  29. formdata={'transactionID':self.id},
  30. clickdata={'name': 'search'},callback=self.parseLinks
  31. )
  32.  
  33. def parseLinks(self,response):
  34. lex = LinkExtractor(allow=('http://ec.europa.eu/environment/ets/singleTransaction.do',),unique=True)
  35. for l in lex.extract_links(response):
  36. yield Request(l.url,method='GET',callback=self.parseDetail,)
  37.  
  38. def parseDetail(self,response):
  39. sel = Selector(response)
  40. item = TransactionItem()
  41. item['transactionID'] = sel.xpath('//table/tr/td/input[@name="transactionID"]/@value').extract()
  42. item['transactionDate'] = sel.xpath('//table/tr/td/input[@name="transactionDate"]/@value').extract()
  43.  
  44. lext = LinkExtractor(unique=True,restrict_xpaths = ('//*[@id="tblTransactionBlocksInformation"]/tr/td[6]/a[@class="resultlink"]'),)
  45. for l in lext.extract_links(response):
  46. yield Request(l.url,method='GET',meta={'item':item},callback=self.parseAccounttr)
  47.  
  48. def parseAccounttr(self,response):
  49. sel = Selector(response)
  50. item = response.meta['item']
  51. item['tra_id'] = sel.xpath('//*[@id="tblAccountInfoReadonly"]/tr/td/input[@name="identifierInReg"]/@value').extract()
  52.  
  53. lexa = LinkExtractor(unique=True,restrict_xpaths = ('//*[@id="tblTransactionBlocksInformation"]/tr/td[7]/a[@class="resultlink"]'),)
  54. for l in lexa.extract_links(response):
  55. yield Request(l.url,method='GET',meta={'item':item},callback=self.parseAccountac)
  56.  
  57. def parseAccountac(self,response):
  58. sel = Selector(response)
  59. item = response.meta['item']
  60. item['acq_id'] = sel.xpath('//*[@id="tblAccountInfoReadonly"]/tr/td/input[@name="identifierInReg"]/@value').extract()
  61. yield item
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement