Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python
import logging

import scrapy
from scrapy.http import FormRequest
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider

from loginform import fill_login_form

logger = logging.getLogger('Zauba')
class zauba(CrawlSpider):
    """Spider that logs in to zauba.com and scrapes the paginated
    gold-import listings, yielding one item per complete table row.

    Flow: start_requests -> parse_login (submit credentials) ->
    start_crawl (re-request start_urls authenticated) -> parse
    (count pages) -> extract_entries (scrape each page).
    """

    name = 'Zauba'
    login_url = 'https://www.zauba.com/user'
    login_user = 'scrapybot1@gmail.com'
    login_password = 'scrapybot1'
    logger.info('zauba')
    start_urls = ['https://www.zauba.com/import-gold/p-1-hs-code.html']

    def start_requests(self):
        """Kick off the crawl by fetching the login page first."""
        logger.info('start_request')
        # let's start by sending a first request to login page
        yield scrapy.Request(self.login_url, callback=self.parse_login)

    def parse_login(self, response):
        """Fill the login form found on the login page and submit it."""
        logger.warning('parse_login')
        # got the login page, let's fill the login form...
        data, url, method = fill_login_form(response.url, response.body,
                                            self.login_user,
                                            self.login_password)
        # ... and send a request with our login data
        return FormRequest(url, formdata=dict(data),
                           method=method, callback=self.start_crawl)

    def start_crawl(self, response):
        """We are authenticated now; request the protected start pages."""
        logger.warning('start_crawl')
        # OK, we're in, let's start crawling the protected pages
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Read the total-entries banner and schedule every listing page."""
        logger.info('parse')
        text = response.xpath(
            '//div[@id="block-system-main"]/div[@class="content"]'
            '/div[@style="width:920px; margin-bottom:12px;"]/span/text()'
        ).extract_first()
        total_entries = int(text.split()[0].replace(',', ''))
        # The site shows 30 rows per page; integer ceil-division covers the
        # remainder without needing the (previously un-imported) math module.
        total_pages = -(-total_entries // 30)
        # BUG FIX: original concatenated the int total_pages onto a str
        # (TypeError, in both logger.warning and print); use a placeholder.
        logger.warning('*************** : %d', total_pages)
        for page in range(1, total_pages + 1):
            # BUG FIX: `page` is an int, so it must be formatted into the
            # URL rather than concatenated (TypeError); xrange -> range.
            url = 'https://www.zauba.com/import-gold/p-%d-hs-code.html' % page
            # BUG FIX: original called the undefined `log.msg` with the
            # undefined name `pages`.
            logger.info('url%d : %s', page, url)
            yield scrapy.Request(url, callback=self.extract_entries)

    def extract_entries(self, response):
        """Yield one item per complete (9-column) row of the listing table."""
        logger.warning('extract_entries')
        row_trs = response.xpath('//div[@id="block-system-main"]'
                                 '/div[@class="content"]/div/table/tr')
        for row_tr in row_trs[1:]:  # skip the header row
            row_content = row_tr.xpath('.//td/text()').extract()
            # Rows with fewer than 9 cells are incomplete/merged; skip them.
            if len(row_content) == 9:
                yield {
                    'date': row_content[0].replace(' ', ''),
                    'hs_code': int(row_content[1]),
                    'description': row_content[2],
                    'origin_country': row_content[3],
                    'port_of_discharge': row_content[4],
                    'unit': row_content[5],
                    'quantity': int(row_content[6].replace(',', '')),
                    'value_inr': int(row_content[7].replace(',', '')),
                    'per_unit_inr': int(row_content[8].replace(',', '')),
                }
- #!/usr/bin/env python
- import sys
- from argparse import ArgumentParser
- from collections import defaultdict
- from lxml import html
- __version__ = '1.0' # also update setup.py
def _form_score(form):
    """Score how strongly *form* resembles a login form (higher is better)."""
    score = 0
    # Login forms typically hold user/pass, optionally plus remember-me.
    if len(form.inputs.keys()) in (2, 3):
        score += 10

    counts = defaultdict(int)
    for field in form.inputs:
        if isinstance(field, html.InputElement):
            counts[field.type] += 1
        else:
            counts['other'] += 1

    # Reward or penalise based on the mix of input types observed.
    score += 10 if counts['text'] > 1 else 0
    score -= 10 if not counts['text'] else 0
    score += 10 if counts['password'] == 1 else 0
    score -= 10 if not counts['password'] else 0
    score -= 10 if counts['checkbox'] > 1 else 0
    score -= 10 if counts['radio'] else 0
    return score
def _pick_form(forms):
    """Return the form most likely to be a login form"""
    # max() with a key is equivalent to sorted(..., reverse=True)[0]:
    # both are stable, so ties resolve to the first highest-scoring form.
    return max(forms, key=_form_score)
def _pick_fields(form):
    """Return the most likely field names for username and password"""
    first_named = {}
    for field in form.inputs:
        if not isinstance(field, html.InputElement):
            continue
        # Keep the name of the first usefully-named input of each type;
        # a None name leaves the slot open for a later input of that type.
        if field.type in ('password', 'text', 'email') \
                and first_named.get(field.type) is None:
            first_named[field.type] = field.name
    username = first_named.get('text') or first_named.get('email')
    return (username, first_named.get('password'))
def submit_value(form):
    """Returns the value for the submit input, if any"""
    # Lazily scan for the first named submit input; the generator stops
    # as soon as one is found, matching the original early return.
    hit = next(((f.name, f.value) for f in form.inputs
                if f.type == 'submit' and f.name), None)
    return [] if hit is None else [hit]
def fill_login_form(url, body, username, password):
    """Find the most likely login form in *body* and fill in credentials.

    Returns a (form_values, action_url, method) triple ready to submit.
    """
    doc = html.document_fromstring(body, base_url=url)
    login_form = _pick_form(doc.xpath('//form'))
    user_key, pass_key = _pick_fields(login_form)
    login_form.fields[user_key] = username
    login_form.fields[pass_key] = password
    payload = login_form.form_values() + submit_value(login_form)
    target = login_form.action or login_form.base_url
    return (payload, target, login_form.method)
def main():
    """CLI entry point: fetch *url* and print the filled login payload.

    Returns a non-zero exit status when the optional `requests`
    dependency is missing.
    """
    ap = ArgumentParser()
    ap.add_argument('-u', '--username', default='username')
    ap.add_argument('-p', '--password', default='secret')
    ap.add_argument('url')
    args = ap.parse_args()
    try:
        import requests
    except ImportError:
        # BUG FIX: the original printed this message and then fell through
        # to `requests.get`, crashing with a NameError; exit non-zero instead.
        print('requests library is required to use loginform as a tool')
        return 1
    r = requests.get(args.url)
    (values, action, method) = fill_login_form(args.url, r.text,
                                               args.username, args.password)
    # BUG FIX: Python 2 print statements converted to the print() function
    # (single-argument form, valid under both Python 2 and 3).
    print('''url: {0}
method: {1}
payload:'''.format(action, method))
    for (k, v) in values:
        print('- {0}: {1}'.format(k, v))


if __name__ == '__main__':
    sys.exit(main())
- 2016-10-02 23:31:28 [scrapy] INFO: Scrapy 1.1.3 started (bot: scraptest)
- 2016-10-02 23:31:28 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'scraptest.spiders', 'FEED_URI': 'medic.json', 'SPIDER_MODULES': ['scraptest.spiders'], 'BOT_NAME': 'scraptest', 'ROBOTSTXT_OBEY': True, 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0', 'FEED_FORMAT': 'json', 'AUTOTHROTTLE_ENABLED': True}
- 2016-10-02 23:31:28 [scrapy] INFO: Enabled extensions:
- ['scrapy.extensions.feedexport.FeedExporter',
- 'scrapy.extensions.logstats.LogStats',
- 'scrapy.extensions.telnet.TelnetConsole',
- 'scrapy.extensions.corestats.CoreStats',
- 'scrapy.extensions.throttle.AutoThrottle']
- 2016-10-02 23:31:28 [scrapy] INFO: Enabled downloader middlewares:
- ['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
- 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
- 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
- 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
- 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
- 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
- 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
- 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
- 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
- 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
- 'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
- 'scrapy.downloadermiddlewares.stats.DownloaderStats']
- 2016-10-02 23:31:28 [scrapy] INFO: Enabled spider middlewares:
- ['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
- 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
- 'scrapy.spidermiddlewares.referer.RefererMiddleware',
- 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
- 'scrapy.spidermiddlewares.depth.DepthMiddleware']
- 2016-10-02 23:31:28 [scrapy] INFO: Enabled item pipelines:
- []
- 2016-10-02 23:31:28 [scrapy] INFO: Spider opened
- 2016-10-02 23:31:28 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
- 2016-10-02 23:31:28 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6024
- 2016-10-02 23:31:29 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/robots.txt> (referer: None)
- 2016-10-02 23:31:38 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/import-gold/p-1-hs-code.html> (referer: None)
- 2016-10-02 23:31:38 [scrapy] INFO: Closing spider (finished)
- 2016-10-02 23:31:38 [scrapy] INFO: Dumping Scrapy stats:
- {'downloader/request_bytes': 558,
- 'downloader/request_count': 2,
- 'downloader/request_method_count/GET': 2,
- 'downloader/response_bytes': 136267,
- 'downloader/response_count': 2,
- 'downloader/response_status_count/200': 2,
- 'finish_reason': 'finished',
- 'finish_time': datetime.datetime(2016, 10, 3, 6, 31, 38, 560012),
- 'log_count/DEBUG': 3,
- 'log_count/INFO': 7,
- 'response_received_count': 2,
- 'scheduler/dequeued': 1,
- 'scheduler/dequeued/memory': 1,
- 'scheduler/enqueued': 1,
- 'scheduler/enqueued/memory': 1,
- 'start_time': datetime.datetime(2016, 10, 3, 6, 31, 28, 927872)}
- 2016-10-02 23:31:38 [scrapy] INFO: Spider closed (finished)
- >>> scrapy shell "https://www.zauba.com/import-gold/p-1-hs-code.html"
- from scrapy import FormRequest
- login_data={'name':'mylogin', 'pass':'mypass'}
- request = FormRequest.from_response(response, formdata=login_data)
- print(request.body)
- # b'form_build_id=form-Lf7bFJPTN57MZwoXykfyIV0q3wzZEQqtA5s6Ce-bl5Y&form_id=user_login_block&op=Log+in&pass=mypass&name=mylogin'
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement