#!/usr/bin/env python

import logging
import math

import scrapy
from scrapy.http import FormRequest
from scrapy.spiders import CrawlSpider

from loginform import fill_login_form


class ZaubaSpider(CrawlSpider):
    name = 'Zauba'
    login_url = 'https://www.zauba.com/user'
    login_user = 'scrapybot1@gmail.com'
    login_password = 'scrapybot1'
    start_urls = ['https://www.zauba.com/import-gold/p-1-hs-code.html']

    def start_requests(self):
        # Start with the login page instead of start_urls.
        logging.warning('requesting login page')
        yield scrapy.Request(self.login_url, callback=self.parse_login)

    def parse_login(self, response):
        # Got the login page; fill in the login form...
        logging.warning('filling login form')
        data, url, method = fill_login_form(response.url, response.body,
                                            self.login_user,
                                            self.login_password)
        # ... and submit it with our credentials.
        return FormRequest(url, formdata=dict(data), method=method,
                           callback=self.start_crawl)

    def start_crawl(self, response):
        # We are logged in; start crawling the protected pages.
        logging.warning('logged in, starting crawl')
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Re-submit the login form found on the listing page (original
        # flow kept as-is), then go count the result pages.
        args, url, method = fill_login_form(response.url, response.body,
                                            self.login_user,
                                            self.login_password)
        logging.warning('re-submitting form on listing page')
        return FormRequest(url, method=method, formdata=dict(args),
                           callback=self.getPageNumber)

    def getPageNumber(self, response):
        # The total entry count appears above the results table;
        # 30 entries are listed per page.
        text = response.xpath(
            '//div[@id="block-system-main"]/div[@class="content"]'
            '/div[@style="width:920px; margin-bottom:12px;"]'
            '/span/text()').extract_first()
        total_entries = int(text.split()[0].replace(',', ''))
        total_pages = int(math.ceil(total_entries / 30.0))
        logging.warning('total pages: %d', total_pages)
        for page in range(1, total_pages + 1):
            url = 'https://www.zauba.com/import-gold/p-%d-hs-code.html' % page
            logging.info('url %d: %s', page, url)
            yield scrapy.Request(url, callback=self.extract_entries)

    def extract_entries(self, response):
        row_trs = response.xpath(
            '//div[@id="block-system-main"]/div[@class="content"]'
            '/div/table/tr')
        # Skip the header row; only 9-column rows are data rows.
        for row_tr in row_trs[1:]:
            row_content = row_tr.xpath('.//td/text()').extract()
            if len(row_content) == 9:
                logging.debug(row_content)
                yield {
                    'date': row_content[0].replace(' ', ''),
                    'hs_code': int(row_content[1]),
                    'description': row_content[2],
                    'origin_country': row_content[3],
                    'port_of_discharge': row_content[4],
                    'unit': row_content[5],
                    'quantity': int(row_content[6].replace(',', '')),
                    'value_inr': int(row_content[7].replace(',', '')),
                    'per_unit_inr': int(row_content[8].replace(',', '')),
                }
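As an aside, the loginform dependency can be avoided entirely: Scrapy ships with FormRequest.from_response, which parses the form out of the login page and keeps its hidden fields. The sketch below is an illustration, not the spider above: the field names 'name' and 'pass' are assumptions (the usual Drupal login fields) and the failure check is a heuristic, so verify both against the real markup of https://www.zauba.com/user.

import scrapy

class ZaubaLoginSketch(scrapy.Spider):
    # Hypothetical minimal spider; only the login step is shown.
    name = 'zauba_login_sketch'
    start_urls = ['https://www.zauba.com/user']

    def parse(self, response):
        # from_response() locates the form in the page and pre-fills its
        # hidden fields; we only override the credential fields, whose
        # names here are assumptions.
        return scrapy.FormRequest.from_response(
            response,
            formdata={'name': 'scrapybot1@gmail.com', 'pass': 'scrapybot1'},
            callback=self.after_login,
        )

    def after_login(self, response):
        # Heuristic check: if the login form is still present, login failed.
        if b'user-login' in response.body:
            self.logger.error('login appears to have failed')
        else:
            self.logger.info('logged in; protected pages can be requested')

For reference, here is the log from the original run: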
2016-10-02 23:31:28 [scrapy] INFO: Scrapy 1.1.3 started (bot: scraptest)
2016-10-02 23:31:28 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'scraptest.spiders', 'FEED_URI': 'medic.json', 'SPIDER_MODULES': ['scraptest.spiders'], 'BOT_NAME': 'scraptest', 'ROBOTSTXT_OBEY': True, 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0', 'FEED_FORMAT': 'json', 'AUTOTHROTTLE_ENABLED': True}
2016-10-02 23:31:28 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.throttle.AutoThrottle']
2016-10-02 23:31:28 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-10-02 23:31:28 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-10-02 23:31:28 [scrapy] INFO: Enabled item pipelines:
[]
2016-10-02 23:31:28 [scrapy] INFO: Spider opened
2016-10-02 23:31:28 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-10-02 23:31:28 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6024
2016-10-02 23:31:29 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/robots.txt> (referer: None)
2016-10-02 23:31:38 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/import-gold/p-1-hs-code.html> (referer: None)
2016-10-02 23:31:38 [scrapy] INFO: Closing spider (finished)
2016-10-02 23:31:38 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 558,
 'downloader/request_count': 2,
 'downloader/request_method_count/GET': 2,
 'downloader/response_bytes': 136267,
 'downloader/response_count': 2,
 'downloader/response_status_count/200': 2,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2016, 10, 3, 6, 31, 38, 560012),
 'log_count/DEBUG': 3,
 'log_count/INFO': 7,
 'response_received_count': 2,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'start_time': datetime.datetime(2016, 10, 3, 6, 31, 28, 927872)}
2016-10-02 23:31:38 [scrapy] INFO: Spider closed (finished)
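Reading the log back: only two GET requests were made (robots.txt and the first listing page), request_method_count shows no POST at all, and the spider closed after ten seconds with zero items scraped. The login page at /user was never requested either, so in the run that produced this log the login form was never submitted and extract_entries never ran, which is consistent with the errors fixed above (the missing scrapy and math imports and the login_pass attribute typo). Once the spider runs as intended, it can be launched from the project directory with, for example:

scrapy crawl Zauba

The FEED_FORMAT and FEED_URI settings visible in the overridden-settings line will then write the scraped items to medic.json.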