Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import scrapy
- import sys
- import csv
- from utils import extract_texts_between_tags, get_text_containing
- if sys.version_info[0] == 3:
- from urllib.request import urlopen
- else:
- # Not Python 3 - today, it is most likely to be Python 2
- # But note that this might need an update when Python 4
- # might be around one day
- from urllib import urlopen
def extract_texts_between_tags(response, html_tag):
    """
    Return the non-empty, stripped text nodes found directly under every
    occurrence of ``html_tag`` in the page.

    :param response: scrapy response (anything exposing ``.xpath``)
    :param html_tag: tag name to search for, e.g. ``"p"`` or ``"h1"``
    :returns: list of stripped, non-empty text fragments
    """
    xpath_query = "//%s/text()" % html_tag
    stripped_fragments = (fragment.strip()
                          for fragment in response.xpath(xpath_query).extract())
    # Keep only fragments that are non-empty after whitespace stripping.
    return [fragment for fragment in stripped_fragments if fragment]
def get_text_containing(response, pattern):
    """
    Return every ``<p>`` element whose text contains ``pattern``.

    ``pattern`` is interpolated straight into the XPath expression and is
    wrapped in double quotes there, so it must not itself contain a double
    quote (single quotes are fine).

    :param response: scrapy response (anything exposing ``.xpath``)
    :param pattern: substring to look for, e.g. ``"?"``
    :returns: list of matching extracted elements
    """
    query = '//p[contains(text(),"%s")]' % pattern
    return list(response.xpath(query).extract())
class WysdomCrawlerSpider(scrapy.Spider):
    """Spider that crawls pages and dumps visible text to a .txt or .csv file."""

    name = 'wysdom_crawler'
    # TODO: we want this to be configurable
    MAX_DEPTH = 2  # stop following links once the response depth reaches this
    file_name = "test_file_name"  # output basename; extension added per format
    csv_headers = ['Phrase', 'Referral URL', 'Link out URL']

    def __init__(self, category=None, *args, **kwargs):
        super(WysdomCrawlerSpider, self).__init__(*args, **kwargs)
        # Toggle between CSV output (one row per phrase group) and plain text.
        self.is_csv = False
        if self.is_csv:
            self.create_csv_file(self.csv_headers)
            # TODO add more tags
            self.tags = ['p', 'h1', 'h2', 'h3', 'h4', 'th', 'li']
            self.delimiter = ","
        else:
            self.tags = ['p', 'h1', 'h2', 'div', 'li']
            self.delimiter = " "
        self.start_urls = ['https://www.td.com/ca/en/personal-banking/how-to/']

    def append_text_to_file(self, file_name, parsed_text):
        """
        Append ``parsed_text`` to ``<file_name>.txt``.

        :param file_name: basename of the .txt file (no extension)
        :param parsed_text: text to append
        """
        # BUG FIX: the original opened the literal path 'file_name.txt'
        # instead of using the parameter.  Open in binary append mode so
        # the UTF-8 encoded bytes write correctly on both Python 2 and 3
        # (text mode would reject bytes on Python 3).
        with open(file_name + '.txt', 'ab') as f:
            f.write(parsed_text.encode("utf-8"))

    def _open_csv(self, base_mode):
        """Open self.file_name + '.csv' with the csv-safe mode for this Python.

        :param base_mode: 'w' to create/truncate, 'a' to append
        """
        path = self.file_name + '.csv'
        if sys.version_info[0] >= 3:
            # Python 3's csv module requires a text file opened with newline=''.
            return open(path, base_mode, newline='')
        # Python 2's csv module requires a binary file.
        return open(path, base_mode + 'b')

    def create_csv_file(self, csv_headers):
        """
        Create a fresh CSV file and write the header row.

        :param csv_headers: list of header strings (kept for interface
            compatibility; ``self.csv_headers`` is what is actually written,
            as in the original implementation)
        """
        # BUG FIX: the original always opened in 'wb', which makes the csv
        # module fail on Python 3 despite the file's py2/py3 shim.
        with self._open_csv('w') as csvfile:
            filewriter = csv.writer(csvfile,
                                    delimiter=',',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            filewriter.writerow(self.csv_headers)

    def append_text_to_csv(self, file_name, add_row):
        """
        Append one row to the CSV file as the spider crawls.

        :param file_name: unused (kept for interface compatibility;
            ``self.file_name`` is what is actually used, as in the original)
        :param add_row: list of [phrase, referral url, linkout url]
        """
        with self._open_csv('a') as csvfile:
            filewriter = csv.writer(csvfile,
                                    delimiter=',',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            filewriter.writerow(add_row)

    def get_response_links(self, response):
        """
        Collect every absolute http(s) link on the page.

        :param response: response from scrapy
        :returns: list of href strings starting with http:// or https://
        """
        result = []
        for link in response.xpath('//a[@href]/@href').extract():
            link = link.strip()
            # Skip empty hrefs and relative/javascript/mailto links.
            if link.startswith(("http://", "https://")):
                result.append(link)
        return result

    def start_requests(self):
        """Issue the initial requests with a browser-like User-Agent."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self, response):
        """
        Extract text from the configured tags, persist it, and recurse
        into outgoing links until MAX_DEPTH is reached.

        :param response: response from scrapy
        """
        for tag in self.tags:
            extracted_text = extract_texts_between_tags(response, tag)
            # BUG FIX: was a Python 2 print statement (syntax error on py3).
            print(extracted_text)
            if self.is_csv:
                # phrase, referral url, linkout url
                add_row = [extracted_text, response.request.url, response.url]
                self.append_text_to_csv(self.file_name, add_row)
            else:
                self.append_text_to_file(self.file_name,
                                         self.delimiter.join(extracted_text))
        if self.is_csv:
            # "?" phrases are captured separately as candidate questions.
            question_phrases = get_text_containing(response, "?")
            for question_phrase in question_phrases:
                add_row = [question_phrase, response.request.url, response.url]
                self.append_text_to_csv(self.file_name, add_row)
            # exclamation_phrases = self.get_text_containing("!")
        # BUG FIX: "depth" is only present once DepthMiddleware has set it;
        # default to 0 so the first response does not raise KeyError.
        depth = response.meta.get("depth", 0)
        print(depth)
        if depth < self.MAX_DEPTH:
            for link in self.get_response_links(response):
                yield scrapy.Request(link, callback=self.parse)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement