Guest User

Untitled

a guest
Jan 24th, 2017
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.82 KB | None | 0 0
  1. """Render HTML for scraping"""
  2. # -*- coding: utf-8 -*-
  3.  
  4. import os
  5. import sys
  6. from contextlib import contextmanager
  7. from multiprocessing import Pool
  8. import lxml.html
  9.  
# Guarantee that the name ``TimeoutError`` is defined on every interpreter:
# where no builtin of that name exists, fall back to multiprocessing's class.
# NOTE(review): when the builtin does exist (Python 3) it is left bound here,
# and it is a different class from ``multiprocessing.TimeoutError``, which is
# what ``AsyncResult.get(timeout=...)`` raises -- confirm that handlers using
# this name catch the exception they are meant to.
try:
    TimeoutError
except NameError:
    from multiprocessing import TimeoutError # Python 2
  14.  
  15.  
  16. def _render(source_html):
  17. """Return rendered HTML."""
  18. from PyQt5.QtCore import QEventLoop, QUrl
  19. from PyQt5.QtWebEngineWidgets import QWebEngineView
  20. from PyQt5.QtWidgets import QApplication
  21.  
  22. class Render(QWebEngineView):
  23. """Render HTML with PyQt5 WebEngine."""
  24.  
  25. def __init__(self, url):
  26. self.html = None
  27. self.app = QApplication(sys.argv)
  28. QWebEngineView.__init__(self)
  29. self.loadFinished.connect(self._loadFinished)
  30. self.loadFinished.connect(self._events_in_html)
  31. self.load(QUrl(url))
  32. self.show()
  33. while not self._events_in_html():
  34. self.app.processEvents(
  35. QEventLoop.ExcludeUserInputEvents |
  36. QEventLoop.ExcludeSocketNotifiers |
  37. QEventLoop.WaitForMoreEvents)
  38.  
  39. self.app.quit()
  40.  
  41. def _events_in_html(self):
  42. if self.html is None:
  43. return False
  44. #print(len(lxml.html.fromstring(html).xpath('//*[@class="event"]')))
  45. elif len(lxml.html.fromstring(self.html).xpath('//*//div[@class="events-result-set"]/*'))>0:
  46. #if len(lxml.html.fromstring(html).xpath('//*[@class="events-result-set"]/*[@class="event"]'))>0:
  47. return True
  48. else:
  49. return False
  50.  
  51. def _callable(self, data):
  52. self.html = data
  53.  
  54. def _loadFinished(self, result):
  55. self.page().toHtml(self._callable)
  56.  
  57. #with devnull():
  58. return Render(source_html).html
  59.  
  60. @contextmanager
  61. def devnull():
  62. """Temporarily redirect stdout and stderr to /dev/null."""
  63.  
  64. try:
  65. original_stderr = os.dup(sys.stderr.fileno())
  66. original_stdout = os.dup(sys.stdout.fileno())
  67. null = open(os.devnull, 'w')
  68. os.dup2(null.fileno(), sys.stderr.fileno())
  69. os.dup2(null.fileno(), sys.stdout.fileno())
  70. yield
  71.  
  72. finally:
  73. if original_stderr is not None:
  74. os.dup2(original_stderr, sys.stderr.fileno())
  75. if original_stdout is not None:
  76. os.dup2(original_stdout, sys.stdout.fileno())
  77. if null is not None:
  78. null.close()
  79.  
  80. def render(html):
  81. """Perform render in a new process to prevent hangs."""
  82.  
  83. tries = 3
  84.  
  85. for _ in range(tries):
  86. pool = Pool(1)
  87. try:
  88. return pool.apply_async(_render, args=(html,)).get(timeout=120)
  89. except TimeoutError:
  90. continue
  91. finally:
  92. pool.terminate()
  93.  
  94. raise TimeoutError('Timed out attempting to render HTML %d times' % tries)
Add Comment
Please, Sign In to add comment