# -*- coding: utf-8 -*-
import scrapy
import sys
import csv

if sys.version_info[0] == 3:
    from urllib.request import urlopen
else:
    # Not Python 3 - today, it is most likely to be Python 2
    # But note that this might need an update when Python 4
    # might be around one day
    from urllib import urlopen


def extract_texts_between_tags(response, html_tag):
    """
    Extracts the text enclosed by an html tag
    :param response: response from scrapy
    :param html_tag: html tag to extract text from
    :return: list of phrases
    """
    xpath = "//" + html_tag + "/text()"
    texts = response.xpath(xpath).extract()
    texts_to_return = []
    for text in texts:
        # strip text
        text = text.strip()
        if text:
            texts_to_return.append(text)
    return texts_to_return
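
# Illustrative sketch of extract_texts_between_tags; the HTML below is a
# made-up example, not taken from the crawled site:
#
#     response contains: <h1>  How-to guides </h1> and <h1>   </h1>
#     extract_texts_between_tags(response, "h1")
#     # -> ['How-to guides']   (whitespace stripped, empty text nodes dropped)
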

def get_text_containing(response, pattern):
    """
    Pattern is injected into xpath. Eg if pattern was "?",
    we would look for all text containing "?".
    Pattern must be wrapped by double-quotes, not single-quotes
    """
    phrases = []
    for phrase in response.xpath('//p[contains(text(),"%s")]' % pattern).extract():
        phrases.append(phrase)
    return phrases
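
# Illustrative sketch of get_text_containing; again with made-up HTML:
#
#     response contains: <p>How do I send money?</p> and <p>Fees apply.</p>
#     get_text_containing(response, "?")
#     # -> ['<p>How do I send money?</p>']   (full <p> elements, tags included)
#
# Because the pattern is interpolated between double quotes in the XPath,
# the pattern itself must not contain a double-quote character.
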

class WysdomCrawlerSpider(scrapy.Spider):
    name = 'wysdom_crawler'
    # TODO: we want this to be configurable
    MAX_DEPTH = 2
    file_name = "test_file_name"
    csv_headers = ['Phrase', 'Referral URL', 'Link out URL']

    def __init__(self, category=None, *args, **kwargs):
        super(WysdomCrawlerSpider, self).__init__(*args, **kwargs)
        self.is_csv = False
        if self.is_csv:
            self.create_csv_file(self.csv_headers)
            # TODO add more tags
            self.tags = ['p', 'h1', 'h2', 'h3', 'h4', 'th', 'li']
            self.delimiter = ","
        else:
            self.tags = ['p', 'h1', 'h2', 'div', 'li']
            self.delimiter = " "

        self.start_urls = ['https://www.td.com/ca/en/personal-banking/how-to/']

    def append_text_to_file(self, file_name, parsed_text):
        """
        Appends text to file
        :param file_name: string name of .txt file (without extension)
        :param parsed_text: text to append
        """
        with open(file_name + '.txt', 'a') as f:
            f.write(parsed_text.encode("utf-8"))

    def create_csv_file(self, csv_headers):
        """
        Create new CSV file with headers in init
        :param csv_headers: list of strings for headers
        """
        with open(self.file_name + '.csv', 'wb') as csvfile:
            filewriter = csv.writer(csvfile,
                                    delimiter=',',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            filewriter.writerow(self.csv_headers)

    def append_text_to_csv(self, file_name, add_row):
        """
        Appends text to csv file as spider crawls
        :param add_row: list of row with phrase, ref url, linkout url
        """
        with open(self.file_name + '.csv', 'a+') as csvfile:
            filewriter = csv.writer(csvfile,
                                    delimiter=',',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            filewriter.writerow(add_row)
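
    # Illustrative sketch of append_text_to_csv; the row below is a made-up
    # example matching csv_headers = ['Phrase', 'Referral URL', 'Link out URL']:
    #
    #     self.append_text_to_csv(self.file_name,
    #                             ['How do I send money?',
    #                              'https://www.td.com/ca/en/personal-banking/how-to/',
    #                              'https://www.td.com/ca/en/personal-banking/how-to/'])
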
    def get_response_links(self, response):
        """
        Grabs all links from HTML
        :param response: response from scrapy
        """
        result = []
        for link in response.xpath('//a[@href]/@href').extract():
            link = link.strip()
            if link == "":
                continue
            if not (link.startswith("http://") or link.startswith("https://")):
                continue
            result.append(link)
        return result
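
    # Illustrative sketch of get_response_links: relative links and empty hrefs
    # are skipped, only absolute http/https URLs are kept (made-up example):
    #
    #     hrefs on the page: '/ca/en/personal-banking/', 'https://www.td.com/ca/en/'
    #     get_response_links(response)
    #     # -> ['https://www.td.com/ca/en/']
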
    def start_requests(self):
        """
        Generates the initial requests for the spider
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)
    def parse(self, response):
        """
        Recursively crawls up to MAX_DEPTH layers deep by handling requests for URLs
        :param response: response from scrapy
        """
        # {"p": ["<p>sup</p>", "<p>hi</p>"]}

        for tag in self.tags:
            extracted_text = extract_texts_between_tags(response, tag)
            print(extracted_text)

            if self.is_csv:
                # phrase, referral url, linkout url
                add_row = [extracted_text, response.request.url, response.url]
                self.append_text_to_csv(self.file_name, add_row)
            else:
                self.append_text_to_file(self.file_name, self.delimiter.join(extracted_text))

        if self.is_csv:
            question_phrases = get_text_containing(response, "?")
            for question_phrase in question_phrases:
                add_row = [question_phrase, response.request.url, response.url]
                self.append_text_to_csv(self.file_name, add_row)
            # exclamation_phrases = self.get_text_containing("!")

        depth = response.meta["depth"]
        print(depth)
        if depth < self.MAX_DEPTH:
            for link in self.get_response_links(response):
                yield scrapy.Request(link, callback=self.parse)
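
if __name__ == '__main__':
    # Minimal sketch for running this spider directly ("python wysdom_crawler.py")
    # instead of "scrapy crawl wysdom_crawler"; the module filename and the
    # default settings are assumptions, not part of the original project.
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(WysdomCrawlerSpider)
    process.start()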