Advertisement
Guest User

Untitled

a guest
Feb 2nd, 2012
243
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.81 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. from scrapy.xpath import HtmlXPathSelector
  4. from scrapy.link.extractors import RegexLinkExtractor
  5. from scrapy.contrib.spiders import CrawlSpider, Rule
  6. from scrapy.contrib_exp import adaptors
  7. from eventscraper.items import EventscraperItem
  8. from scrapy import log
  9. from eventscraper import util
  10. from string import join as j
  11. from datetime import datetime, timedelta
  12. import re
  13.  
  14.  
  15. DATE_REGEX = re.compile(r'(\d+)\.(%s)\.(\d){,2}.*?([0-1]\d|2[0-4]):([0-5]\d)hs' % j([m[:3] for m in util.MONTHS], '|'), re.IGNORECASE)
  16.  
  17. def _fecha_adaptor(value):
  18. m = DATE_REGEX.search(value)
  19. if m is None:
  20. raise Exception("Couldn't parse: %s" % value)
  21. g = m.groups()
  22. d = datetime(2000 + int(g[2]),
  23. [m[:3] for m in util.MONTHS].index(g[1].lower()) + 1,
  24. int(g[0]))
  25. if int(g[3]) == 24: # 24 no es valido como hora, sumamos 1 dia a la fecha y seteamos hora a 0
  26. d = d + timedelta(1)
  27. d = datetime(d.year, d.month, d.day, 0, int(g[4]))
  28. else:
  29. d = datetime(d.year, d.month, d.day, int(g[3]), int(g[4]))
  30.  
  31. return d
  32.  
  33.  
  34.  
  35. class ProyectoUnderSpider(CrawlSpider):
  36. domain_name = 'proyectounder.com'
  37. start_urls = ['http://www.proyectounder.com/agenda.php?agenda=1']
  38.  
  39. rules = (
  40. Rule(RegexLinkExtractor(allow=(r'/evento/\d+',)), 'parse_event', follow=True),
  41. Rule(RegexLinkExtractor(allow=(r'/agenda.php\?action=display',)))
  42. )
  43.  
  44. adaptor_pipe = [adaptors.extract, adaptors.delist(''), adaptors.strip]
  45.  
  46. adaptor_map = {
  47. 'summary': adaptor_pipe,
  48. 'description': adaptor_pipe,
  49. 'start_date': [adaptors.extract, adaptors.delist(''), _fecha_adaptor]
  50. }
  51.  
  52.  
  53. def parse_event(self, response):
  54. i = EventscraperItem()
  55. i.set_adaptors(self.adaptor_map)
  56.  
  57. xs = HtmlXPathSelector(response)
  58.  
  59. i.attribute('guid', response.url)
  60. i.attribute('url', response.url)
  61. i.attribute('summary', xs.x('//div[contains(@class, "eventoItem")]/h1/text()'))
  62. i.attribute('description', xs.x('//div[contains(@class, "eventoItem")]/p/text()'))
  63. i.attribute('start_date', xs.x('//div[contains(@class, "eventoItem")]/span[contains(@class, "fecha")][1]/text()'))
  64.  
  65. venue = {
  66. 'name': xs.x('//div[contains(@class, "agendaVermas")]//a[contains(@href, "agenda.php?lugar=")][1]/text()').extract()[0].strip(),
  67. 'city': xs.x('//div[contains(@class, "agendaVermas")]//a[contains(@href, "agenda.php?ciudad=")][1]/text()').extract()[0].strip()
  68. }
  69.  
  70. log.msg(venue)
  71.  
  72. i.attribute('meta', {'source_site': 'proyectounder.com',
  73. 'scraped_on': datetime.now(),
  74. 'source_html': response.body })
  75.  
  76. return [i]
  77.  
  78. SPIDER = ProyectoUnderSpider()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement