#-*-coding:utf8;-*-
#qpy:3
#qpy:console

from html.parser import HTMLParser as _HTMLParser
from urllib.request import urlopen, Request
import urllib.error
import urllib.robotparser
from urllib.parse import urlparse, urljoin
from json import dumps as parse_json
import cProfile
import mimetypes
import time
import math
import random


mimetypes.init()

PROFILE = cProfile.Profile()


ENABLE_PROFILING = False
if ENABLE_PROFILING:
    PROFILE.enable()

DEFAULT_ENCODING = 'latin-1'

PARANT = '../'
def profile(func):
    # Decorator that profiles calls to func with the global profiler.
    def wrap(*args, **kw):
        PROFILE.enable()
        try:
            return func(*args, **kw)
        finally:
            # Switch profiling off again even if func raises.
            PROFILE.disable()
    return wrap
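# The decorator is not applied anywhere below; if wanted it could be used
# like this (slow_function is only a hypothetical example name):
#
#     @profile
#     def slow_function():
#         ...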

def average(lst):
    return float(sum(lst) / len(lst))

def random_wait(mini, maxi):
    time.sleep(random.randint(mini, maxi))

USER_AGENTS = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
]

def random_ua():
    return random.choice(USER_AGENTS)

class CrawlerClient(object):
    def __init__(self, **kw):
        self.robotparser = urllib.robotparser.RobotFileParser()
        self.ua = random_ua()
        self.referer = kw.get('referer', 'www.google.com')
        self.not_found = set()

    def can_fetch(self, url):
        return self.robotparser.can_fetch(self.ua, url)

    def get(self, url):
        self.ua = random_ua()
        req = Request(url)
        req.add_header('User-Agent', self.ua)
        req.add_header('Connection', 'keep-alive')
        req.add_header('Accept', 'text/html,xhtml,xml')
        req.add_header('Referer', self.referer)

        parsed_url = urlparse(url)
        robot_file_path = parsed_url.scheme + '://' + parsed_url.netloc + '/robots.txt'
        self.robotparser.set_url(robot_file_path)
        self.robotparser.read()
        if self.can_fetch(url):
            try:
                with urlopen(req) as res:
                    http_headers = res.headers

                    content_type, *charset = http_headers.get('content-type', '').split(';')
                    # Try to read the charset from the Content-Type header.
                    if charset:
                        charset = charset[0].strip().split('=')[1]
                    # Otherwise use the fallback encoding.
                    else:
                        charset = DEFAULT_ENCODING

                    # If the downloaded content is of type text/*,
                    # return it so it can be fed to the parser.
                    if content_type.split('/')[0] == 'text':
                        return res.read().decode(charset)
            except urllib.error.HTTPError as e:
                # urlopen() raises HTTPError for 4xx/5xx responses, so missing
                # pages are recorded here; other error codes are skipped.
                if e.code == 404:
                    self.not_found.add(url)
            except Exception:
                # Ignore anything else (timeouts, bad charsets, ...).
                pass


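# A minimal sketch of using CrawlerClient on its own (the URL below is only a
# placeholder, and fetching it needs network access; this is not part of the
# crawl run further down):
#
#     client = CrawlerClient(referer='https://www.google.com')
#     html = client.get('https://example.com')
#     print(html[:200] if html else 'blocked by robots.txt, non-text, or failed')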
class HTMLParser(_HTMLParser):
    def __init__(self, url, strict=False):
        self.url = urlparse(url)
        self.size = 0
        self.client = CrawlerClient(referer='https://rickys-python-notes.blogspot.com')
        # If strict is True the parser will raise on broken html.
        # Otherwise it will ignore broken html and keep on parsing.
        if not strict:
            self.error = self._do_nothing

        _HTMLParser.__init__(self)
        # links holds all the links the parser finds.
        # The parser looks for links in anchor and link tags.
        self.links = set()
        self.base_url = '{}://{}'.format(self.url.scheme, self.url.netloc)
        # title will hold the page's title, if the page has one.
        self.title = None
        # This flag lets handle_data know that we are currently reading
        # the title text, so it can store it in self.title.
        self.recording_title = False

        html = self.client.get(url)
        if html:
            self.feed(html)

    def handle_starttag(self, tag, attrs):
        # Pick up an href attribute wherever it appears in the tag
        # (anchor and link tags carry the links we care about).
        val = dict(attrs).get('href')

        if val:
            # Protocol-relative links inherit the page's scheme.
            if val.startswith('//'):
                val = self.url.scheme + ':' + val

            url = urlparse(val)
            if not url.netloc:
                url = urlparse(urljoin(self.base_url, url.path))
            self.links.add(url.geturl())

        if tag == 'title':
            self.recording_title = True

    def handle_endtag(self, tag):
        if tag == 'title':
            self.recording_title = False

    def handle_data(self, data):
        if self.recording_title:
            self.title = data.strip()

    def _do_nothing(self, *_, **__):
        return


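# Likewise, HTMLParser can be exercised on a single page before running the
# full crawl; the URL here is again only a placeholder:
#
#     page = HTMLParser('https://example.com')
#     print(page.title)
#     print(sorted(page.links))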
class CrawlerQueue(object):
    def __init__(self, seed, **kw):
        self.seed = seed
        self.tocrawl = [seed]
        self.crawled = list()
        self.non_html_links = list()
        self.domain = urlparse(seed).netloc
        self.same_domain = kw.get('same_domain', True)
        self.exclude_parant_links = kw.get('exclude_parant_links', True)

    def next(self):
        # Pick a random link from the queue and mark it as crawled.
        random.shuffle(self.tocrawl)
        link = self.tocrawl.pop()
        self.crawled.append(link)
        return link

    def is_same_domain(self, link):
        return urlparse(link).netloc == self.domain

    def add_link(self, link):
        # Links that do not look like HTML pages are remembered separately
        # and never queued for crawling.
        guessed_type = mimetypes.guess_type(link)[0] or 'text/html'
        if guessed_type != 'text/html':
            self.non_html_links.append(link)
            return

        if link in self.crawled:
            return

        if self.exclude_parant_links and PARANT in link:
            return

        if not self.same_domain:
            self.tocrawl.append(link)
        elif self.is_same_domain(link):
            self.tocrawl.append(link)

    def add_links(self, links):
        for link in links:
            self.add_link(link)

    @property
    def total_crawled(self):
        return len(self.crawled)

    @property
    def in_queue(self):
        return len(self.tocrawl)

    @property
    def total_non_html_links(self):
        return len(self.non_html_links)

    @property
    def has_links(self):
        return bool(self.tocrawl)

    @property
    def empty(self):
        return not self.has_links


q = CrawlerQueue('http://reddit.com', same_domain=False)
not_found = set()
while q.has_links:
    crawling = q.next()
    page = HTMLParser(crawling)
    not_found.update(page.client.not_found)
    q.add_links(page.links)
    title = page.title
    if title:
        print(title, not_found)
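
# A politer variant of the loop above, sketched here as a comment rather than
# run: it wires in the random_wait() and PROFILE helpers defined earlier,
# which the script otherwise never calls. The 1-5 second delay is an
# arbitrary choice, not something the original crawl does.
#
#     while q.has_links:
#         page = HTMLParser(q.next())
#         not_found.update(page.client.not_found)
#         q.add_links(page.links)
#         if page.title:
#             print(page.title, q.total_crawled, q.in_queue)
#         random_wait(1, 5)
#
#     if ENABLE_PROFILING:
#         PROFILE.disable()
#         PROFILE.print_stats(sort='cumulative')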