# -*- coding: utf-8 -*-
import scrapy
import sys
import csv
from scrapy.selector import HtmlXPathSelector  # imported but unused below
from bs4 import BeautifulSoup  # imported but unused below

if sys.version_info[0] == 3:
    from urllib.request import urlopen
else:
    # Not Python 3 - today, it is most likely to be Python 2
    # But note that this might need an update when Python 4
    # might be around one day
    from urllib import urlopen


def extract_texts_between_tags(response, html_tag):
    """
    Extracts the text nodes inside every occurrence of an HTML tag.
    :param response: response (or selector) from scrapy
    :param html_tag: html tag to extract text from
    :return: list of non-empty, stripped strings
    """
    xpath = "//" + html_tag + "/text()"
    texts = response.xpath(xpath).extract()
    texts_to_return = []
    for text in texts:
        # Drop surrounding whitespace and skip empty strings
        text = text.strip()
        if text:
            texts_to_return.append(text)
    return texts_to_return
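
# A minimal usage sketch (my own illustration, not part of the original
# spider): Scrapy's standalone Selector answers .xpath() just like a
# response object, so the helper can be exercised without running a crawl.
#
#     from scrapy.selector import Selector
#     sel = Selector(text="<p> hi </p><p></p><p>there</p>")
#     extract_texts_between_tags(sel, "p")   # -> ['hi', 'there']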


class WysdomCrawlerSpider(scrapy.Spider):
    name = 'wysdom_crawler'
    # TODO: we want this to be configurable
    MAX_DEPTH = 2
    file_name = "test_file_name"
    csv_headers = ['Tag Phrase', 'Referral URL', 'Link out URL']

    def __init__(self, category=None, *args, **kwargs):
        super(WysdomCrawlerSpider, self).__init__(*args, **kwargs)
        self.is_csv = False
        if self.is_csv:
            self.create_csv_file(self.csv_headers)
            # TODO add more tags
            self.tags = ['p', 'h1', 'h2', 'h3', 'h4', 'th', 'li']
            self.delimiter = ","
        else:
            self.tags = ['p', 'h1', 'h2', 'div', 'li']
            self.delimiter = " "

        self.start_urls = ['https://www.td.com/ca/en/personal-banking/how-to/']

    def append_text_to_file(self, file_name, parsed_text):
        """
        Appends text to a .txt file.
        :param file_name: string name of the .txt file, without extension
        :param parsed_text: text to append
        """
        # Python 2 idiom: text mode plus an explicit utf-8 encode; on
        # Python 3, pass encoding='utf-8' to open() and drop the encode.
        with open(file_name + '.txt', 'a') as f:
            f.write(parsed_text.encode("utf-8"))

    def create_csv_file(self, csv_headers):
        """
        Creates a new CSV file with a header row (called from __init__).
        :param csv_headers: list of strings for the header row
        """
        # 'wb' is the Python 2 idiom for csv files; on Python 3 use
        # open(..., 'w', newline='') instead.
        with open(self.file_name + '.csv', 'wb') as csvfile:
            filewriter = csv.writer(csvfile,
                                    delimiter=',',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            filewriter.writerow(csv_headers)

    def append_text_to_csv(self, add_row):
        """
        Appends one row to the CSV file as the spider crawls.
        :param add_row: list holding phrase, referral url and linkout url
        """
        # Append mode keeps earlier rows; a writer must be built on each
        # call because the file handle is reopened every time.
        with open(self.file_name + '.csv', 'ab') as csvfile:
            filewriter = csv.writer(csvfile,
                                    delimiter=',',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            filewriter.writerow(add_row)
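
    # A hedged usage sketch (illustration only; the URLs are made up):
    #
    #     spider.append_text_to_csv(
    #         ['How to send money', 'https://example.com/a', 'https://example.com/b'])
    #
    # appends the line: How to send money,https://example.com/a,https://example.com/b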

    def get_response_links(self, response):
        """
        Grabs all links from the HTML.
        :param response: response from scrapy
        :return: list of absolute http(s) URLs
        """
        result = []
        for link in response.xpath('//a[@href]/@href').extract():
            link = link.strip()
            if link == "":
                continue
            # Relative links are skipped; only absolute URLs are followed
            if not (link.startswith("http://") or link.startswith("https://")):
                continue
            result.append(link)
        return result

    def start_requests(self):
        """
        Generates the initial requests for the spider.
        """
        # Present a desktop-browser User-Agent instead of Scrapy's default
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)
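
    # Hedged sketch (not in the original): Scrapy's built-in settings can
    # throttle and depth-limit the crawl per spider via the standard
    # `custom_settings` class attribute, e.g.
    #
    #     custom_settings = {
    #         'DOWNLOAD_DELAY': 1.0,  # seconds between requests
    #         'DEPTH_LIMIT': 2,       # built-in alternative to MAX_DEPTH
    #     }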

    def parse(self, response):
        """
        Recursively crawls up to MAX_DEPTH layers deep by following links.
        :param response: response from scrapy
        """
        # time.sleep(5)

        # e.g. {"p": ["sup", "hi"]}
        parsed_text = {}
        for tag in self.tags:
            extracted_text = extract_texts_between_tags(response, tag)
            parsed_text[tag] = extracted_text
            print(extracted_text)

            if self.is_csv:
                # phrase, referral url, linkout url; request.url and
                # response.url can differ after redirects
                add_row = [self.delimiter.join(extracted_text),
                           response.request.url,
                           response.url]
                self.append_text_to_csv(add_row)
            else:
                self.append_text_to_file(self.file_name,
                                         self.delimiter.join(extracted_text))

        print(parsed_text)
        # DepthMiddleware sets response.meta["depth"]; fall back to 0 in
        # case the key is absent on the first response.
        depth = response.meta.get("depth", 0)
        print(depth)
        if depth < self.MAX_DEPTH:
            for link in self.get_response_links(response):
                yield scrapy.Request(link, callback=self.parse)
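

# A hedged sketch of how to run this spider with the standard Scrapy CLI
# (the module filename below is an assumption):
#
#     scrapy runspider wysdom_crawler.py
#
# Inside a full Scrapy project, `scrapy crawl wysdom_crawler` works as well.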