# gg_spider.py

# -*- coding: utf-8 -*-
import scrapy
from datetime import datetime, timedelta
import re

class GgSpiderSpider(scrapy.Spider):
    """Scrape the posts of the GOG forum thread listed in ``start_urls``.

    ``parse`` reads the page count from the first response and schedules one
    request per page; ``postparse`` turns every post on a page into a dict
    (author, date, vote information, text with and without quotes, ...).
    """

    name = 'gg_spider'
    allowed_domains = ['www.gog.com']
    start_urls = ['https://www.gog.com/forum/general/forum_mafia_51_the_greater_good/']

    @staticmethod
    def _parse_post_date(raw_date, now=None):
        """Convert a post's "Posted ..." label into a ``datetime``.

        Args:
            raw_date: Text of the post's date element, or ``None`` when the
                element was not found.
            now: Reference time for relative labels ("2 hours ago");
                defaults to ``datetime.today()``.

        Returns:
            A naive ``datetime``, or ``None`` when ``raw_date`` is ``None``.

        Raises:
            ValueError: If the label is neither a recognised relative form
                nor an absolute date in ``'%B %d, %Y'`` format.
        """
        if raw_date is None:
            return None

        label = raw_date.strip()
        # Remove the literal prefix.  ``str.strip("Posted ")`` would instead
        # strip any of those *characters* from both ends of the label.
        if label.startswith("Posted"):
            label = label[len("Posted"):].strip()
        if now is None:
            now = datetime.today()

        # "an hour ago"-style labels carry no digit; treat them as 1 unit.
        number = re.search(r'\d+', label)
        amount = int(number.group()) if number else 1

        # Word-boundary matches accept singular and plural units without
        # mistaking "Today"/"Yesterday" for a "day" count.
        if "now" in label:
            return now
        if re.search(r'\bminutes?\b', label):
            return now - timedelta(minutes=amount)
        if "hour" in label:
            return now - timedelta(hours=amount)
        if "Yesterday" in label:
            return now - timedelta(days=1)
        if re.search(r'\bdays?\b', label):
            return now - timedelta(days=amount)
        return datetime.strptime(label, '%B %d, %Y')

    @staticmethod
    def _scan_votes(bolded_text):
        """Return ``(vote, unvote)`` found in the bolded snippets of a post.

        Only two-word snippets are considered.  ``vote`` is the second word
        of the last "vote <name>" snippet (``False`` if there is none);
        ``unvote`` is ``True`` if any snippet starts with "unvote".
        """
        vote = False
        unvote = False
        for snippet in bolded_text:
            words = snippet.split()
            if len(words) != 2:
                continue
            keyword = words[0].lower()
            if keyword == "vote":
                vote = words[1]
            elif keyword == "unvote":
                unvote = True
        return vote, unvote

    @staticmethod
    def _unique_quotes(quote_hrefs, quote_texts):
        """Pair quote texts with the quoted post numbers, without duplicates.

        The post number is the last run of digits in the matching href; pairs
        whose href has no digits are skipped.  First-seen order is kept so the
        result is the same on every run (iterating a ``set`` of strings is not).
        """
        seen = set()
        pairs = []
        for href, text in zip(quote_hrefs, quote_texts):
            numbers = re.findall(r'\d+', href)
            if not numbers:
                continue
            pair = (numbers[-1], text)
            if pair not in seen:
                seen.add(pair)
                pairs.append(pair)
        return pairs

    def postparse(self, response):
        """Yield one dict per post found on a thread page.

        Each dict holds the post number, author, date, detected vote/unvote,
        the links in the post, the quoted authors and quoted texts, the post
        text with quotes wrapped in ``[quote_<nr>] ... [/quote]`` markers, the
        post text with quotes removed, and the length of the latter.
        """
        for post in response.css("div.big_post_main"):
            postnr = post.css("div.post_nr a::text").extract_first()
            author = post.css("div.b_u_name::text").extract_first()
            date_iso = self._parse_post_date(
                post.css("div.post_date::text").extract_first())

            posttext_complete_string = '\n'.join(
                post.css("div.post_text_c").xpath("string(normalize-space(.))").extract())
            posttext_links = post.css("div.post_text_c a::attr(href)").extract()

            # Bolded text directly inside post_text_c only; bold text nested
            # in quotes etc. is ignored.
            bolded_text = post.xpath(
                ".//div[@class='post_text_c']/span[@class='bold']").xpath("string()").extract()
            post_vote, post_unvote = self._scan_votes(bolded_text)

            quotedauthor = [
                quotauth.replace(': ', '')
                for quotauth in post.css("span.quot_user_name::text").extract()
            ]

            quotedtext = post.css("div.quot.quot_text.normal_color").xpath(
                "string(normalize-space(.))").extract()
            # ".//" keeps the lookup inside this post; a leading "//" would
            # search the whole page and mix in other posts' quote links.
            quoted_hrefs = post.xpath(".//a[@class='link_arrow']/@href").extract()
            quote_pairs = self._unique_quotes(quoted_hrefs, quotedtext)
            quotedtext_converted = [
                '[quote_' + nr + '] ' + txt + '[/quote] \n' for nr, txt in quote_pairs
            ]

            posttext_noquote = posttext_complete_string
            posttext_converted = posttext_complete_string
            for (_nr, quote), converted in zip(quote_pairs, quotedtext_converted):
                # An empty quote would make str.replace insert the marker
                # between every character of the post text.
                if quote and quote in posttext_noquote:
                    posttext_noquote = posttext_noquote.replace(quote, '')
                    posttext_converted = posttext_converted.replace(quote, converted)

            #This is our output: (-o something.json)
            yield {'postnr' : postnr, 'author' : author, 'date' : date_iso, 'vote' : post_vote, 'unvote' : post_unvote, 'posttext_complete' : posttext_converted, 'posttext_links' : posttext_links, 'quotedauthor' : quotedauthor, 'quotedtext' : quotedtext_converted, 'posttext_noquote' : posttext_noquote, 'post_length' : len(posttext_noquote)}

    def parse(self, response):
        """Schedule a ``postparse`` request for every page of the thread.

        The page count is read from the "last page" navigation link of the
        response.  If that link is missing or not a number, only page 1 is
        requested (presumably the thread then has a single page).
        """
        last_page = response.css('div.n_b_b_nrs_h.n_b_b_nr_last a.n_b_b_nr::text').extract_first()
        try:
            num_pages = int(last_page)
        except (TypeError, ValueError):
            self.logger.warning(
                "Could not read the page count (%r); scraping page 1 only", last_page)
            num_pages = 1

        base_url = self.start_urls[0]
        for page in range(1, num_pages + 1):
            yield scrapy.Request(url="{}page{}".format(base_url, page), callback=self.postparse)