Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- from scrapy.xpath import HtmlXPathSelector
- from scrapy.link.extractors import RegexLinkExtractor
- from scrapy.contrib.spiders import CrawlSpider, Rule
- from scrapy.contrib_exp import adaptors
- from eventscraper.items import EventscraperItem
- from scrapy import log
- from eventscraper import util
- from string import join as j
- from datetime import datetime, timedelta
- import re
# Matches event dates such as "12.dic.09 - 21:30hs":
#   group 1: day of month
#   group 2: three-letter month abbreviation, built from util.MONTHS
#   group 3: one- or two-digit year
#   group 4: hour (00-24)    group 5: minute (00-59)
# The year digits are captured as a whole with (\d{1,2}).  A repeated
# one-digit group such as (\d){,2} keeps only the LAST digit matched, so
# "10" would be captured as "0", and no digits at all would capture None.
# Hour 24 is accepted by the pattern; _fecha_adaptor rolls it over to the
# next day.
DATE_REGEX = re.compile(
    r'(\d+)\.(%s)\.(\d{1,2}).*?([0-1]\d|2[0-4]):([0-5]\d)hs'
    % '|'.join(month[:3] for month in util.MONTHS),
    re.IGNORECASE)
def _fecha_adaptor(value):
    """Parse an event date string into a ``datetime``.

    ``value`` is searched with ``DATE_REGEX``.  The match supplies the day,
    a three-letter month abbreviation (looked up in ``util.MONTHS``), the
    year as an offset from 2000, and the hour and minute.

    Returns the resulting naive ``datetime``.

    Raises ``ValueError`` when ``value`` does not match the pattern, when
    the match carries no year digits, or when the matched values do not
    form a valid date.
    """
    match = DATE_REGEX.search(value)
    if match is None:
        raise ValueError("Couldn't parse: %s" % value)

    day, month_abbr, year, hour, minute = match.groups()
    if year is None:
        # A pattern whose year group is optional can match without any
        # year digits; report that as a parse failure rather than letting
        # int(None) raise an unrelated TypeError.
        raise ValueError("Couldn't parse: %s" % value)

    # The regex is case-insensitive, so normalise both sides of the lookup.
    abbreviations = [name[:3].lower() for name in util.MONTHS]
    month = abbreviations.index(month_abbr.lower()) + 1

    parsed = datetime(2000 + int(year), month, int(day), 0, int(minute))
    if int(hour) == 24:
        # 24 is not a valid hour: move to the next day and keep hour 0.
        return parsed + timedelta(days=1)
    return parsed.replace(hour=int(hour))
class ProyectoUnderSpider(CrawlSpider):
    """Crawl spider that scrapes events from the proyectounder.com agenda.

    Crawling starts at the agenda page in ``start_urls``.  Links matching
    ``/evento/<number>`` are handed to ``parse_event``; links matching the
    agenda ``action=display`` pattern have no callback of their own.
    """

    domain_name = 'proyectounder.com'
    start_urls = ['http://www.proyectounder.com/agenda.php?agenda=1']

    rules = (
        # Event detail pages: scraped by parse_event, and their links followed.
        Rule(RegexLinkExtractor(allow=(r'/evento/\d+',)), 'parse_event', follow=True),
        # Agenda listing pages: no callback is given here.
        # NOTE(review): presumably these are only crawled for further links;
        # confirm against the Rule defaults of the Scrapy version in use.
        Rule(RegexLinkExtractor(allow=(r'/agenda.php\?action=display',))),
    )

    # Shared adaptor chain for plain-text fields: extract, delist, strip.
    adaptor_pipe = [adaptors.extract, adaptors.delist(''), adaptors.strip]
    # Per-attribute adaptor chains applied to the item built in parse_event.
    adaptor_map = {
        'summary': adaptor_pipe,
        'description': adaptor_pipe,
        # The date text is converted to a datetime by _fecha_adaptor.
        'start_date': [adaptors.extract, adaptors.delist(''), _fecha_adaptor],
    }

    def _first_text(self, selector, xpath):
        """Return the stripped first result of ``xpath``, or '' if none."""
        extracted = selector.x(xpath).extract()
        if not extracted:
            # Missing node: fall back to an empty string instead of raising
            # IndexError and losing the whole event.
            return ''
        return extracted[0].strip()

    def parse_event(self, response):
        """Build an ``EventscraperItem`` from an event detail page.

        The item receives the page URL (as both ``guid`` and ``url``), the
        summary, description and start date selected from the ``eventoItem``
        div, and a ``meta`` dict with the source site, the scrape time and
        the raw response body.  Venue name and city are looked up and
        logged, but are not stored on the item.

        Returns a one-element list containing the item.
        """
        item = EventscraperItem()
        item.set_adaptors(self.adaptor_map)
        xs = HtmlXPathSelector(response)

        item.attribute('guid', response.url)
        item.attribute('url', response.url)
        item.attribute('summary', xs.x('//div[contains(@class, "eventoItem")]/h1/text()'))
        item.attribute('description', xs.x('//div[contains(@class, "eventoItem")]/p/text()'))
        item.attribute('start_date', xs.x('//div[contains(@class, "eventoItem")]/span[contains(@class, "fecha")][1]/text()'))

        venue = {
            'name': self._first_text(xs, '//div[contains(@class, "agendaVermas")]//a[contains(@href, "agenda.php?lugar=")][1]/text()'),
            'city': self._first_text(xs, '//div[contains(@class, "agendaVermas")]//a[contains(@href, "agenda.php?ciudad=")][1]/text()'),
        }
        log.msg(venue)

        item.attribute('meta', {'source_site': 'proyectounder.com',
                                'scraped_on': datetime.now(),
                                'source_html': response.body})
        return [item]


# Module-level spider instance.
SPIDER = ProyectoUnderSpider()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement