#!/usr/bin/env python

import logging
import math

import scrapy
from scrapy.spiders import CrawlSpider
from scrapy.http import FormRequest
from loginform import fill_login_form

logger = logging.getLogger('Zauba')


class zauba(CrawlSpider):
    name = 'Zauba'
    login_url = 'https://www.zauba.com/user'
    login_user = 'scrapybot1@gmail.com'
    login_password = 'scrapybot1'
    logger.info('zauba')
    start_urls = ['https://www.zauba.com/import-gold/p-1-hs-code.html']

    def start_requests(self):
        logger.info('start_requests')
        # let's start by sending a first request to the login page
        yield scrapy.Request(self.login_url, callback=self.parse_login)

    def parse_login(self, response):
        logger.warning('parse_login')
        # got the login page, let's fill the login form...
        data, url, method = fill_login_form(response.url, response.body,
                                            self.login_user, self.login_password)

        # ... and send a request with our login data
        return FormRequest(url, formdata=dict(data),
                           method=method, callback=self.start_crawl)

    def start_crawl(self, response):
        logger.warning('start_crawl')
        # OK, we're in, let's start crawling the protected pages
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        logger.info('parse')
        # the first listing page shows the total number of entries;
        # 30 entries are listed per page
        text = response.xpath('//div[@id="block-system-main"]/div[@class="content"]/div[@style="width:920px; margin-bottom:12px;"]/span/text()').extract_first()
        total_entries = int(text.split()[0].replace(',', ''))
        total_pages = int(math.ceil(total_entries / 30.0))
        logger.warning('*************** : %d', total_pages)
        for page in range(1, total_pages + 1):
            url = 'https://www.zauba.com/import-gold/p-%d-hs-code.html' % page
            logger.info('url%d : %s', page, url)
            yield scrapy.Request(url, callback=self.extract_entries)

    def extract_entries(self, response):
        logger.warning('extract_entries')
        row_trs = response.xpath('//div[@id="block-system-main"]/div[@class="content"]/div/table/tr')
        for row_tr in row_trs[1:]:
            row_content = row_tr.xpath('.//td/text()').extract()
            # skip header/malformed rows; data rows have exactly 9 cells
            if len(row_content) == 9:
                yield {
                    'date': row_content[0].replace(' ', ''),
                    'hs_code': int(row_content[1]),
                    'description': row_content[2],
                    'origin_country': row_content[3],
                    'port_of_discharge': row_content[4],
                    'unit': row_content[5],
                    'quantity': int(row_content[6].replace(',', '')),
                    'value_inr': int(row_content[7].replace(',', '')),
                    'per_unit_inr': int(row_content[8].replace(',', '')),
                }

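For reference, a minimal sketch of running this spider from a standalone script with Scrapy's CrawlerProcess; the settings below only mirror the ones visible in the log output further down and are assumptions about the project, not part of the original code:

    # run_zauba.py - assumes the zauba spider class above is importable
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0',
        'FEED_FORMAT': 'json',
        'FEED_URI': 'medic.json',
        'AUTOTHROTTLE_ENABLED': True,
    })
    process.crawl(zauba)
    process.start()  # blocks here until the crawl finishes
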
#!/usr/bin/env python

import sys
from argparse import ArgumentParser
from collections import defaultdict
from lxml import html

__version__ = '1.0'  # also update setup.py


def _form_score(form):
    score = 0

    # In case of user/pass or user/pass/remember-me
    if len(form.inputs.keys()) in (2, 3):
        score += 10

    typecount = defaultdict(int)
    for x in form.inputs:
        type_ = x.type if isinstance(x, html.InputElement) else 'other'
        typecount[type_] += 1

    if typecount['text'] > 1:
        score += 10
    if not typecount['text']:
        score -= 10

    if typecount['password'] == 1:
        score += 10
    if not typecount['password']:
        score -= 10

    if typecount['checkbox'] > 1:
        score -= 10
    if typecount['radio']:
        score -= 10

    return score


def _pick_form(forms):
    """Return the form most likely to be a login form"""
    return sorted(forms, key=_form_score, reverse=True)[0]


def _pick_fields(form):
    """Return the most likely field names for username and password"""
    userfield = passfield = emailfield = None
    for x in form.inputs:
        if not isinstance(x, html.InputElement):
            continue

        type_ = x.type
        if type_ == 'password' and passfield is None:
            passfield = x.name
        elif type_ == 'text' and userfield is None:
            userfield = x.name
        elif type_ == 'email' and emailfield is None:
            emailfield = x.name

    return (userfield or emailfield, passfield)


def submit_value(form):
    """Returns the value for the submit input, if any"""
    for x in form.inputs:
        if x.type == 'submit' and x.name:
            return [(x.name, x.value)]
    else:
        return []


def fill_login_form(url, body, username, password):
    doc = html.document_fromstring(body, base_url=url)
    form = _pick_form(doc.xpath('//form'))
    userfield, passfield = _pick_fields(form)
    form.fields[userfield] = username
    form.fields[passfield] = password
    form_values = form.form_values() + submit_value(form)
    return (form_values, form.action or form.base_url, form.method)


def main():
    ap = ArgumentParser()
    ap.add_argument('-u', '--username', default='username')
    ap.add_argument('-p', '--password', default='secret')
    ap.add_argument('url')
    args = ap.parse_args()

    try:
        import requests
    except ImportError:
        print('requests library is required to use loginform as a tool')

    r = requests.get(args.url)
    values, action, method = fill_login_form(args.url, r.text,
                                             args.username, args.password)
    print('url: {0}\nmethod: {1}\npayload:'.format(action, method))
    for k, v in values:
        print('- {0}: {1}'.format(k, v))


if __name__ == '__main__':
    sys.exit(main())

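The module can also be exercised directly from Python rather than through main(); a small usage sketch, reusing the login URL and credentials from the spider above (the returned values naturally depend on the live page):

    import requests
    from loginform import fill_login_form

    resp = requests.get('https://www.zauba.com/user')
    values, action, method = fill_login_form(resp.url, resp.text,
                                             'scrapybot1@gmail.com', 'scrapybot1')
    print('submit to {0} via {1}'.format(action, method))
    for name, value in values:
        print('- {0}: {1}'.format(name, value))
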
2016-10-02 23:31:28 [scrapy] INFO: Scrapy 1.1.3 started (bot: scraptest)
2016-10-02 23:31:28 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'scraptest.spiders', 'FEED_URI': 'medic.json', 'SPIDER_MODULES': ['scraptest.spiders'], 'BOT_NAME': 'scraptest', 'ROBOTSTXT_OBEY': True, 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0', 'FEED_FORMAT': 'json', 'AUTOTHROTTLE_ENABLED': True}
2016-10-02 23:31:28 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.throttle.AutoThrottle']
2016-10-02 23:31:28 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-10-02 23:31:28 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-10-02 23:31:28 [scrapy] INFO: Enabled item pipelines:
[]
2016-10-02 23:31:28 [scrapy] INFO: Spider opened
2016-10-02 23:31:28 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-10-02 23:31:28 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6024
2016-10-02 23:31:29 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/robots.txt> (referer: None)
2016-10-02 23:31:38 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/import-gold/p-1-hs-code.html> (referer: None)
2016-10-02 23:31:38 [scrapy] INFO: Closing spider (finished)
2016-10-02 23:31:38 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 558,
 'downloader/request_count': 2,
 'downloader/request_method_count/GET': 2,
 'downloader/response_bytes': 136267,
 'downloader/response_count': 2,
 'downloader/response_status_count/200': 2,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2016, 10, 3, 6, 31, 38, 560012),
 'log_count/DEBUG': 3,
 'log_count/INFO': 7,
 'response_received_count': 2,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'start_time': datetime.datetime(2016, 10, 3, 6, 31, 28, 927872)}
2016-10-02 23:31:38 [scrapy] INFO: Spider closed (finished)
  242.  
>>> scrapy shell "https://www.zauba.com/import-gold/p-1-hs-code.html"
from scrapy import FormRequest
login_data = {'name': 'mylogin', 'pass': 'mypass'}
request = FormRequest.from_response(response, formdata=login_data)
print(request.body)
# b'form_build_id=form-Lf7bFJPTN57MZwoXykfyIV0q3wzZEQqtA5s6Ce-bl5Y&form_id=user_login_block&op=Log+in&pass=mypass&name=mylogin'
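
For comparison, the login step in the spider could also be written with FormRequest.from_response instead of loginform, using the 'name' and 'pass' field names seen in the shell output above; a sketch, not tested against the live site:

    def parse_login(self, response):
        # let Scrapy locate the login form and fill in the two known fields
        return scrapy.FormRequest.from_response(
            response,
            formdata={'name': self.login_user, 'pass': self.login_password},
            callback=self.start_crawl,
        )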