Advertisement
Guest User

gg_spider.py

a guest
Oct 14th, 2017
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.18 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. from datetime import datetime, timedelta
  4. import re
  5.  
  6. class GgSpiderSpider(scrapy.Spider):
  7.     name = 'gg_spider'
  8.     allowed_domains = ['www.gog.com']
  9.     start_urls = ['https://www.gog.com/forum/general/forum_mafia_51_the_greater_good/']
  10.        
  11.     def postparse(self, response):
  12.         for post in response.css("div.big_post_main"):
  13.             postnr = post.css("div.post_nr a::text").extract_first()
  14.             author =  post.css("div.b_u_name::text").extract_first()
  15.             post_date = post.css("div.post_date::text").extract_first().strip("Posted ")
  16.             if "now" in post_date:
  17.                 date_iso = datetime.today()
  18.             elif "minutes" in post_date:
  19.                 date_iso = datetime.today() - timedelta(minutes=[int(s) for s in post_date.split() if s.isdigit()][0])
  20.             elif "hour" in post_date:
  21.                 date_iso = datetime.today() - timedelta(hours=[int(s) for s in post_date.split() if s.isdigit()][0])
  22.             elif "Yesterday" in post_date:
  23.                 date_iso = datetime.today() - timedelta(days=1)
  24.             elif "days" in post_date:
  25.                 date_iso = datetime.today() - timedelta(days=[int(s) for s in post_date.split() if s.isdigit()][0])
  26.             else:
  27.                 date_iso = datetime.strptime(post_date, '%B %d, %Y')
  28.  
  29.             posttext_complete_string = post.css("div.post_text_c").xpath("string(normalize-space(.))").extract()
  30.             posttext_complete_string = '\n'.join(posttext_complete_string)
  31.             posttext_links = []
  32.             posttext_links = post.css("div.post_text_c a::attr(href)").extract()
  33.            
  34.             #get bolded text only in post_text_c . ignore rest ie quotes etc
  35.             bolded_text = post.xpath(".//div[@class='post_text_c']/span[@class='bold']").xpath("string()").extract()
  36.             post_vote = False
  37.             post_unvote = False
  38.             for bt in bolded_text:
  39.                 if len(bt.split()) == 2:
  40.                     prob_votes = bt.split()
  41.                     if prob_votes[0].lower() == "vote":
  42.                         post_vote = prob_votes[1]
  43.                     elif prob_votes[0].lower() == "unvote":
  44.                         post_unvote = True
  45.  
  46.             quotedauthor = post.css("span.quot_user_name::text").extract()
  47.             quotedauthor[:] = [quotauth.replace(': ', '') for quotauth in quotedauthor]
  48.  
  49.             quotedtext = post.css("div.quot.quot_text.normal_color").xpath("string(normalize-space(.))").extract()
  50.             quotedpostnr = post.xpath("//a[@class='link_arrow']/@href").extract()
  51.             quotedpostnr[:] = [re.findall(r'\d+', nr)[-1] for nr in quotedpostnr]
  52.             quotedtext_converted = []
  53.             quotedpostnr_unique = []
  54.             quotedtext_unique = []
  55.             for nr, txt in set(zip(quotedpostnr, quotedtext)):
  56.                 quotedpostnr_unique.append(nr)
  57.                 quotedtext_unique.append(txt)
  58.                 quotedtext_converted.append('[quote_' + nr + '] ' + txt + '[/quote] \n')
  59.  
  60.             posttext_noquote = posttext_complete_string
  61.             posttext_converted = posttext_complete_string
  62.             for idx, quote in enumerate(quotedtext_unique):
  63.                 if quote in posttext_noquote:
  64.                     posttext_noquote = posttext_noquote.replace(quote, '')
  65.                     posttext_converted = posttext_converted.replace(quote, quotedtext_converted[idx])
  66.  
  67.             #This is our output: (-o something.json)
  68.             yield {'postnr' : postnr, 'author' : author, 'date' : date_iso, 'vote' : post_vote, 'unvote' : post_unvote, 'posttext_complete' : posttext_converted, 'posttext_links' : posttext_links, 'quotedauthor' : quotedauthor, 'quotedtext' : quotedtext_converted, 'posttext_noquote' : posttext_noquote, 'post_length' : len(posttext_noquote)}
  69.  
  70.     def parse(self, response):
  71.         start_urls = self.start_urls
  72.         num_pages = int(response.css('div.n_b_b_nrs_h.n_b_b_nr_last a.n_b_b_nr::text').extract_first())
  73.         url_list = ["{}page{}".format(start_urls[0], str(page)) for page in range(1,num_pages+1)]
  74.         for url in url_list:
  75.             yield scrapy.Request(url=url, callback=self.postparse)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement