Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- # -*- encoding: utf-8 -*-
- from scrapy.contrib.spiders import CrawlSpider, Rule
- from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
- from scrapy.contrib.loader.processor import TakeFirst
- from scrapy.contrib.loader import XPathItemLoader
- from scrapy.selector import HtmlXPathSelector
- from scrapy import signals
- from scrapy.exporters import XmlItemExporter
- import io,os,re
# Counter for how many page bodies have been written to disk (capped at 6
# by the spider's callback).
globalvar = 0
# NOTE(review): the original `global site` statement here was removed — a
# `global` declaration at module scope is a no-op and only misleads readers.
class MyLoader(XPathItemLoader):
    """Item loader whose every output field defaults to the first
    extracted value (``TakeFirst``) unless a field overrides it."""

    default_output_processor = TakeFirst()
class MySpider(CrawlSpider):
    """Crawl en.wikipedia.org following every link, saving the
    whitespace-normalised body text of the first six pages seen to
    numbered files 0.txt .. 5.txt inside a directory named after the site.
    """

    site = "en.wikipedia.org"
    allowed_domains = [site]
    start_urls = ["http://" + site]
    # BUG FIX: the callback method below was originally named `parse`, which
    # overrides CrawlSpider.parse — the method CrawlSpider itself uses to
    # drive its Rules — so link following was silently broken.  The Rule
    # already names 'parse_item'; the method is renamed to match.
    rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)]
    name = "myspider"

    # Output directory derived from the site name ("en_wikipedia_org").
    # BUG FIX: os.mkdir crashed when the directory already existed (e.g. on
    # a re-run); makedirs(exist_ok=True) is idempotent.
    site = site.replace('.', '_')
    os.makedirs(site, exist_ok=True)
    os.chdir(site)

    def parse_item(self, response):
        """Write the plain body text of *response* to "<n>.txt".

        Stops (returns without writing) once six pages have been saved.
        BUG FIX: the original looped `while globalvar < 6` inside a single
        response, writing six identical copies of the first page; the
        evident intent — one file per crawled page, six pages total — is
        implemented here.
        """
        global globalvar
        if globalvar >= 6:
            return
        hxs = HtmlXPathSelector(response)
        plain_text = ''.join(hxs.select("//body//text()").extract()).strip()
        # BUG FIX: the original pattern was r's+', which collapses runs of
        # the letter "s"; r'\s+' collapses runs of whitespace as intended.
        plain_text = re.sub(r'\s+', ' ', plain_text)
        # BUG FIX: the original left the file handle open; `with` guarantees
        # it is flushed and closed.
        with open("{}.txt".format(globalvar), 'wb') as out:
            out.write(plain_text.encode('utf8'))
        globalvar = globalvar + 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement