Advertisement
Not a member of Pastebin yet?
Sign Up —
it unlocks many cool features!
#!/usr/bin/env python
import logging
import math

from scrapy.http import FormRequest
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider

from loginform import fill_login_form
class zauba(CrawlSpider):
    """Spider that logs in to zauba.com and scrapes gold-import records.

    Flow: fetch the login page, submit the credentials, then fetch the
    first listing page to learn the total entry count, request every
    paginated listing page, and yield one dict per table row.
    """

    name = 'Zauba'
    login_url = 'https://www.zauba.com/user'
    login_user = 'scrapybot1@gmail.com'
    login_password = 'scrapybot1'
    start_urls = ['https://www.zauba.com/import-gold/p-1-hs-code.html']

    def start_requests(self):
        """Start with the login page instead of start_urls."""
        logging.warning('dfjkhsdjkfbjldf')
        # let's start by sending a first request to login page
        # Bug fix: `scrapy` was never imported; use the already-imported
        # Request class instead of `scrapy.Request`.
        yield Request(self.login_url, self.parse_login)

    def parse_login(self, response):
        """Fill and submit the login form found on the login page."""
        logging.warning('dfjkhsdjkfbjldf')
        # got the login page, let's fill the login form...
        data, url, method = fill_login_form(response.url, response.body,
                                            self.login_user,
                                            self.login_password)
        # ... and send a request with our login data
        return FormRequest(url, formdata=dict(data),
                           method=method, callback=self.start_crawl)

    def start_crawl(self, response):
        """After login succeeds, request the protected start pages."""
        logging.warning('dfjkhsdjkfbjldf')
        # OK, we're in, let's start crawling the protected pages
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response):
        """Re-submit the form on the listing page, then count pages."""
        # Bug fix: the attribute is `login_password`; `self.login_pass`
        # raised AttributeError at runtime.
        args, url, method = fill_login_form(response.url, response.body,
                                            self.login_user,
                                            self.login_password)
        logging.warning('sdkjvbhvbhk')
        return FormRequest(url, method=method, formdata=args,
                           callback=self.getPageNumber)

    def getPageNumber(self, response):
        """Read the total entry count and request every listing page."""
        logging.warning('**************')
        text = response.xpath('//div[@id="block-system-main"]/div[@class="content"]/div[@style="width:920px; margin-bottom:12px;"]/span/text()').extract_first()
        total_entries = int(text.split()[0].replace(',', ''))
        # The site shows 30 rows per listing page.
        total_pages = int(math.ceil((total_entries * 1.0) / 30))
        # Bug fix: an int cannot be concatenated to a str; format instead.
        logging.warning('*************** : %d', total_pages)
        print('*************** : %d' % total_pages)
        # Bug fix: `xrange` is Python-2-only; `range` works on both.
        for page in range(1, total_pages + 1):
            # Bug fix: `page` is an int — build the URL with %-formatting
            # rather than str + int concatenation.
            url = 'https://www.zauba.com/import-gold/p-%d-hs-code.html' % page
            # Bug fix: `log` was undefined and `pages` was a typo for
            # `page`; use the module-level logging already imported.
            logging.warning('url%d : %s', page, url)
            yield Request(url, callback=self.extract_entries)

    def extract_entries(self, response):
        """Yield one item per 9-column data row of the listing table."""
        row_trs = response.xpath('//div[@id="block-system-main"]/div[@class="content"]/div/table/tr')
        for row_tr in row_trs[1:]:  # skip the header row
            row_content = row_tr.xpath('.//td/text()').extract()
            # Idiom fix: len(x), not x.__len__(); also guards against
            # short or merged rows.
            if len(row_content) == 9:
                print(row_content)
                yield {
                    'date': row_content[0].replace(' ', ''),
                    'hs_code': int(row_content[1]),
                    'description': row_content[2],
                    'origin_country': row_content[3],
                    'port_of_discharge': row_content[4],
                    'unit': row_content[5],
                    'quantity': int(row_content[6].replace(',', '')),
                    'value_inr': int(row_content[7].replace(',', '')),
                    'per_unit_inr': int(row_content[8].replace(',', '')),
                }
- 2016-10-02 23:31:28 [scrapy] INFO: Scrapy 1.1.3 started (bot: scraptest)
- 2016-10-02 23:31:28 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'scraptest.spiders', 'FEED_URI': 'medic.json', 'SPIDER_MODULES': ['scraptest.spiders'], 'BOT_NAME': 'scraptest', 'ROBOTSTXT_OBEY': True, 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0', 'FEED_FORMAT': 'json', 'AUTOTHROTTLE_ENABLED': True}
- 2016-10-02 23:31:28 [scrapy] INFO: Enabled extensions:
- ['scrapy.extensions.feedexport.FeedExporter',
- 'scrapy.extensions.logstats.LogStats',
- 'scrapy.extensions.telnet.TelnetConsole',
- 'scrapy.extensions.corestats.CoreStats',
- 'scrapy.extensions.throttle.AutoThrottle']
- 2016-10-02 23:31:28 [scrapy] INFO: Enabled downloader middlewares:
- ['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
- 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
- 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
- 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
- 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
- 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
- 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
- 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
- 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
- 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
- 'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
- 'scrapy.downloadermiddlewares.stats.DownloaderStats']
- 2016-10-02 23:31:28 [scrapy] INFO: Enabled spider middlewares:
- ['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
- 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
- 'scrapy.spidermiddlewares.referer.RefererMiddleware',
- 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
- 'scrapy.spidermiddlewares.depth.DepthMiddleware']
- 2016-10-02 23:31:28 [scrapy] INFO: Enabled item pipelines:
- []
- 2016-10-02 23:31:28 [scrapy] INFO: Spider opened
- 2016-10-02 23:31:28 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
- 2016-10-02 23:31:28 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6024
- 2016-10-02 23:31:29 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/robots.txt> (referer: None)
- 2016-10-02 23:31:38 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/import-gold/p-1-hs-code.html> (referer: None)
- 2016-10-02 23:31:38 [scrapy] INFO: Closing spider (finished)
- 2016-10-02 23:31:38 [scrapy] INFO: Dumping Scrapy stats:
- {'downloader/request_bytes': 558,
- 'downloader/request_count': 2,
- 'downloader/request_method_count/GET': 2,
- 'downloader/response_bytes': 136267,
- 'downloader/response_count': 2,
- 'downloader/response_status_count/200': 2,
- 'finish_reason': 'finished',
- 'finish_time': datetime.datetime(2016, 10, 3, 6, 31, 38, 560012),
- 'log_count/DEBUG': 3,
- 'log_count/INFO': 7,
- 'response_received_count': 2,
- 'scheduler/dequeued': 1,
- 'scheduler/dequeued/memory': 1,
- 'scheduler/enqueued': 1,
- 'scheduler/enqueued/memory': 1,
- 'start_time': datetime.datetime(2016, 10, 3, 6, 31, 28, 927872)}
- 2016-10-02 23:31:38 [scrapy] INFO: Spider closed (finished)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement