pexcrawler
a guest | Nov 4th, 2017 | Python
# -*- coding: utf-8 -*-
import datetime
import urllib.parse

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from PExCrawler.items import PexcrawlerItem
import sort_csv  # local helper module, not included in this paste

class PexcrawlerSpider(CrawlSpider):
    name = 'pexcrawler'
    allowed_domains = ['pinoyexchange.com']
    start_urls = ['http://www.pinoyexchange.com/forums/forumdisplay.php?f=53']

    custom_settings = {
        'FEED_EXPORT_FIELDS': ['subforum', 'thread_title', 'post_subject', 'post_counter', 'post_content', 'post_time',
                               'username', 'user_posts_per_day', 'user_total_posts', 'quoted_post', 'quoted_username',
                               'img_urls', 'embed_urls', 'other_urls'],
    }

    # Follow links to individual threads (showthread.php?t=<id>) from the forum
    # listing, plus "next page" links, and parse each matching page with parse_item.
    rules = (
        Rule(
            LinkExtractor(
                allow=(r'showthread\.php\?t=\d+',),
                restrict_xpaths=(
                    './/a[starts-with(@id, "thread_title")]',
                    './/a[@rel="next"]',
                ),
            ),
            callback='parse_item',
            follow=True
        ),
    )

    def has_smilies(self, img_link):
        """Return True if the image URL points to one of the forum's smiley graphics."""
        return "images/smilies/" in img_link

    def string_to_delta(self, string_delta):
        """Convert a relative timestamp such as "3 hours ago" into an absolute date string."""
        value, unit, _ = string_delta.split()
        unit_list = ['hour', 'day', 'week']
        if unit in unit_list:
            unit += 's'  # timedelta() expects plural keyword arguments (hours, days, weeks)
        return (datetime.datetime.now() - datetime.timedelta(**{unit: float(value)})).strftime("%b %d, %Y")

    def parse_profile(self, response):
        """Fill in the user statistics from the member profile page and emit the item."""
        item = response.meta['item']
        selector = response.css('div#view-stats')
        item['user_total_posts'] = selector.xpath('./div[2]/dl[1]/dd/text()').extract()
        item['user_posts_per_day'] = selector.xpath('./div[2]/dl[2]/dd/text()').extract()
        return item

    def parse_item(self, response):
        """Extract one item per post on a thread page, then fetch the poster's profile."""
        selector_list = response.css('li.postcontainer')

        for selector in selector_list:
            item = PexcrawlerItem()
            item['subforum'] = response.css('ul.floatcontainer').xpath('./li[4]/a/text()').extract()
            item['thread_title'] = [i.strip() for i in response.css('li.lastnavbit').xpath('./h1/text()').extract()]
            item['username'] = selector.xpath('normalize-space(./div[2]/div/div/div/a/strong//text())').extract()
            # Relative timestamps ("... ago") are converted to absolute dates.
            item['post_time'] = [self.string_to_delta(s) if "ago" in s else s for s in selector.xpath('./div/span/span/text()').extract()]

            # Post body text, excluding anything inside quoted-post (bbcode) containers.
            x = [i.strip() for i in selector.xpath('./div[2]/div[2]/div/div/div/blockquote/descendant::text()[not(ancestor::div/@class="bbcode_container")]').extract()]
            item['post_content'] = ' '.join(filter(None, x))
            item['post_counter'] = selector.xpath('./div/span[2]/a[2]/@name').extract()

            # Quoted post text and the name of the quoted user, if the post quotes anyone.
            y = [i.strip() for i in selector.xpath('./div[2]/div[2]/div/div/div/blockquote/div/div/div/div[3]/descendant::text()').extract()]
            item['quoted_post'] = ' '.join(filter(None, y))
            item['quoted_username'] = selector.xpath('./div[2]/div[2]/div/div/div/blockquote/div/div/div/div[2]/strong//text()').extract()
            z = [i.strip() for i in selector.xpath('./div[2]/div[2]/div/h2/text()').extract()]
            item['post_subject'] = ' '.join(filter(None, z))
            # Linked media; image URLs pointing to smilies are filtered out.
            item['img_urls'] = [url for url in selector.xpath('./div[2]/div[2]/div/div/div/blockquote/img/@src').extract() if not self.has_smilies(url)]
            item['other_urls'] = selector.xpath('./div[2]/div[2]/div/div/div/blockquote/a/@href').extract()
            item['embed_urls'] = selector.xpath('./div[2]/div[2]/div/div/div/blockquote/iframe/@src').extract()

            # Follow the poster's profile link to fill in the user_* fields;
            # dont_filter=True because many posts share the same author URL.
            member_url = urllib.parse.urljoin(response.url, selector.xpath('./div[2]/div/div/div/a/@href').extract_first())
            yield scrapy.Request(member_url, meta={'item': item}, callback=self.parse_profile, dont_filter=True)

    def closed(self, reason):
        # Called by Scrapy when the spider finishes; post-process the exported data.
        sort_csv.sort_data()
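
The PExCrawler.items module imported at the top is not part of this paste. A minimal sketch of what it would need to declare, assuming one scrapy.Field per column listed in FEED_EXPORT_FIELDS:

# PExCrawler/items.py (assumed layout, not included in the original paste)
import scrapy

class PexcrawlerItem(scrapy.Item):
    subforum = scrapy.Field()
    thread_title = scrapy.Field()
    post_subject = scrapy.Field()
    post_counter = scrapy.Field()
    post_content = scrapy.Field()
    post_time = scrapy.Field()
    username = scrapy.Field()
    user_posts_per_day = scrapy.Field()
    user_total_posts = scrapy.Field()
    quoted_post = scrapy.Field()
    quoted_username = scrapy.Field()
    img_urls = scrapy.Field()
    embed_urls = scrapy.Field()
    other_urls = scrapy.Field()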
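To run the spider from inside the Scrapy project and export the scraped posts, an invocation along these lines should work (the output filename is just an example; FEED_EXPORT_FIELDS above fixes the CSV column order):

scrapy crawl pexcrawler -o posts.csv

When the crawl finishes, Scrapy invokes the closed() hook, which calls sort_csv.sort_data(), a local helper not shown in the paste that presumably post-processes the exported file.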