Advertisement
Guest User

Untitled

a guest
Feb 14th, 2016
52
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.22 KB | None | 0 0
  1. # -*- encoding: utf-8 -*-
  2. from scrapy.contrib.spiders import CrawlSpider, Rule
  3. from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
  4. from scrapy.contrib.loader.processor import TakeFirst
  5. from scrapy.contrib.loader import XPathItemLoader
  6. from scrapy.selector import HtmlXPathSelector
  7. from scrapy import signals
  8. from scrapy.exporters import XmlItemExporter
  9. import io,os,re
  10.  
  11. globalvar = 0
  12. global site
  13. class MyLoader(XPathItemLoader):
  14. default_output_processor = TakeFirst()
  15.  
  16. class MySpider(CrawlSpider):
  17. site = "en.wikipedia.org"
  18. allowed_domains = [site]
  19. start_urls = ["http://"+site]
  20. rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)]
  21. name = "myspider"
  22. site = site.replace('.','_')
  23. os.mkdir(site)
  24. os.chdir(site)
  25. def parse(self, response):
  26. hxs = HtmlXPathSelector(response)
  27. global globalvar
  28. while globalvar < 6:
  29. site = hxs.select("//body")
  30. f1 = open("{}.txt".format(globalvar),'wb')
  31. plain_text= ''.join(site.select("//body//text()").extract()).strip()
  32. plain_text = re.sub(r's+', ' ', plain_text)
  33. f1.write(plain_text.encode('utf8'))
  34. globalvar = globalvar +1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement