Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
"""Scrapy spider for a GOG forum mafia thread.

Walks every page of the thread and yields one dict per post with the
author, resolved post date, vote/unvote actions, full text, links and
quotes.  Run with e.g. ``scrapy crawl gg_spider -o something.json``.
"""
import scrapy
from datetime import datetime, timedelta
import re


class GgSpiderSpider(scrapy.Spider):
    name = 'gg_spider'
    allowed_domains = ['www.gog.com']
    start_urls = ['https://www.gog.com/forum/general/forum_mafia_51_the_greater_good/']

    @staticmethod
    def _first_int(text):
        """Return the first run of digits in *text* as an int, or None."""
        match = re.search(r'\d+', text)
        return int(match.group()) if match else None

    def _parse_post_date(self, post_date):
        """Convert GOG's human-readable post age to a datetime.

        Handles "Posted just now", "Posted N minute(s)/hour(s)/day(s) ago",
        "Posted Yesterday" and absolute dates like "Posted December 24, 2020".
        Relative ages are resolved against the local clock (naive datetime),
        so precision is only as good as the wording GOG gives us.
        """
        # BUGFIX: the original used post_date.strip("Posted "), which strips
        # any of the characters {P,o,s,t,e,d,space} from BOTH ends and can
        # eat into the date text itself.  Remove only the literal prefix.
        if post_date.startswith("Posted "):
            post_date = post_date[len("Posted "):]
        if "now" in post_date:
            return datetime.today()
        if "minute" in post_date:  # BUGFIX: also matches singular "1 minute"
            return datetime.today() - timedelta(minutes=self._first_int(post_date))
        if "hour" in post_date:    # matches "hour" and "hours"
            return datetime.today() - timedelta(hours=self._first_int(post_date))
        # NOTE: this check must precede the "day" check below, because the
        # string "Yesterday" itself contains the substring "day".
        if "Yesterday" in post_date:
            return datetime.today() - timedelta(days=1)
        if "day" in post_date:     # BUGFIX: also matches singular "1 day"
            return datetime.today() - timedelta(days=self._first_int(post_date))
        # Absolute date, e.g. "December 24, 2020".
        return datetime.strptime(post_date, '%B %d, %Y')

    def postparse(self, response):
        """Parse one thread page and yield a dict per post.

        This is our output (``-o something.json``): postnr, author, date,
        vote/unvote, converted full text, links, quoted authors/texts,
        quote-free text and its length.
        """
        for post in response.css("div.big_post_main"):
            postnr = post.css("div.post_nr a::text").extract_first()
            author = post.css("div.b_u_name::text").extract_first()
            date_iso = self._parse_post_date(
                post.css("div.post_date::text").extract_first().strip())
            posttext_complete_string = '\n'.join(
                post.css("div.post_text_c").xpath("string(normalize-space(.))").extract())
            posttext_links = post.css("div.post_text_c a::attr(href)").extract()
            # Votes are bolded two-word commands inside post_text_c only
            # (quotes etc. sit outside span.bold and are ignored).
            post_vote = False    # stays False, or becomes the voted player's name (str)
            post_unvote = False
            bolded_text = post.xpath(
                ".//div[@class='post_text_c']/span[@class='bold']").xpath("string()").extract()
            for bt in bolded_text:
                words = bt.split()
                if len(words) == 2:
                    if words[0].lower() == "vote":
                        post_vote = words[1]
                    elif words[0].lower() == "unvote":
                        post_unvote = True
            quotedauthor = [qa.replace(': ', '')
                            for qa in post.css("span.quot_user_name::text").extract()]
            quotedtext = post.css(
                "div.quot.quot_text.normal_color").xpath("string(normalize-space(.))").extract()
            # BUGFIX: the original used the absolute XPath
            # "//a[@class='link_arrow']/@href", which matches every quote
            # arrow on the whole PAGE, not just the ones inside this post.
            # The relative ".//" form scopes the query to `post`.
            quotedpostnr = [re.findall(r'\d+', nr)[-1]
                            for nr in post.xpath(".//a[@class='link_arrow']/@href").extract()]
            # De-duplicate (post number, quoted text) pairs and build the
            # BB-style replacement tags.  NOTE: set() iteration order is
            # arbitrary, as in the original.
            quotedtext_converted = []
            quotedtext_unique = []
            for nr, txt in set(zip(quotedpostnr, quotedtext)):
                quotedtext_unique.append(txt)
                quotedtext_converted.append('[quote_' + nr + '] ' + txt + '[/quote] \n')
            # Produce a quote-free body and a body with quotes converted to tags.
            posttext_noquote = posttext_complete_string
            posttext_converted = posttext_complete_string
            for idx, quote in enumerate(quotedtext_unique):
                if quote in posttext_noquote:
                    posttext_noquote = posttext_noquote.replace(quote, '')
                    posttext_converted = posttext_converted.replace(quote, quotedtext_converted[idx])
            yield {'postnr': postnr,
                   'author': author,
                   'date': date_iso,
                   'vote': post_vote,
                   'unvote': post_unvote,
                   'posttext_complete': posttext_converted,
                   'posttext_links': posttext_links,
                   'quotedauthor': quotedauthor,
                   'quotedtext': quotedtext_converted,
                   'posttext_noquote': posttext_noquote,
                   'post_length': len(posttext_noquote)}

    def parse(self, response):
        """Read the page count from the thread's pagination, then fan out
        one request per page to :meth:`postparse`."""
        num_pages = int(response.css(
            'div.n_b_b_nrs_h.n_b_b_nr_last a.n_b_b_nr::text').extract_first())
        for page in range(1, num_pages + 1):
            url = "{}page{}".format(self.start_urls[0], page)
            yield scrapy.Request(url=url, callback=self.postparse)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement