Guest User

Untitled

a guest
Oct 27th, 2024
30
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.50 KB | None | 0 0
  1. # Ruby Script for Web Content Extraction
  2. #
  3. # Loads the page in Puppeteer and extracts the content with Readability.js.
  4. # Returns "" on error or when the target is not an HTML file.
  5.  
  6. class ExtractContent
  7. include Scrapers::RandomUseragent
  8.  
  9. def fetch_content(url)
  10. # If this was intended to be the primary method, uncomment and fix
  11. # return FetchFeedMasterdataJob.fetch_page(url)
  12.  
  13. execute_with_suppressed_logging do |browser|
  14. browser_session(browser, url) do |page, context|
  15. setup_page(page)
  16. content = extract_content(browser, page, url)
  17. handle_ip_blocks(content)
  18. end
  19. end
  20. rescue => e
  21. Rails.logger.error "ExtractContent.fetch_content failed: #{e.class} - #{e.message}"
  22. Rails.logger.error e.backtrace.join("\n") if Rails.env.development?
  23. ""
  24. end
  25.  
  26. private
  27.  
  28. def setup_page(page)
  29. # Handle cookies, JavaScript execution, etc.
  30. # For example, enable JavaScript
  31. page.javascript_enabled = true
  32.  
  33. # random browser window size
  34. width = rand(1024..1920)
  35. height = rand(768..1080)
  36. page.viewport = Puppeteer::Viewport.new(width: width, height: height)
  37.  
  38. # random mouse moving
  39. # page.mouse.move(x: rand(0..width), y: rand(0..height))
  40.  
  41. # random scrolling
  42. scroll_height = rand(0..height)
  43. # page.evaluate("window.scrollBy(0, #{scroll_height})")
  44.  
  45. # page.keyboard.press('ArrowDown')
  46. # ... any other setup code ...
  47. end
  48.  
  49. def execute_with_suppressed_logging
  50. suppress_logging do
  51. options = {
  52. executable_path: ENV["CHROME_PATH"],
  53. args: [
  54. '--disable-http2',
  55. '--no-sandbox',
  56. '--disable-dev-shm-usage', # Add this for Docker environments
  57. '--disable-gpu', # Add this for headless environments
  58. '--headless' # Make sure we're running headless
  59. ]
  60. }
  61.  
  62. Puppeteer.launch(**options) do |browser|
  63. yield(browser)
  64. end
  65. end
  66. end
  67.  
  68. def browser_session(browser, url)
  69. context = browser.default_browser_context
  70. page = context.new_page
  71. page.set_user_agent(random_useragent)
  72.  
  73. # Increase timeouts for production
  74. page.default_navigation_timeout = 20_000 # 60 seconds
  75. page.default_timeout = 20_000 # 60 seconds
  76.  
  77. yield(page, context)
  78. ensure
  79. page&.close rescue nil
  80. context&.close rescue nil
  81. end
  82.  
  83. def extract_content(browser, page, url)
  84. page.goto(url, wait_until: "networkidle2", timeout: 20_000)
  85. # readability_js = File.read(Rails.root.join('app/assets/javascripts/readability.js'))
  86. # page.evaluate(readability_js)
  87. readability_js = ""
  88. # if Rails.env.production?
  89. # readability_js = File.read(Rails.root.join('assets/javascripts/readability.js'))
  90. # else
  91. # readability_js = File.read(Rails.root.join('app/assets/javascripts/readability.js'))
  92. # end
  93.  
  94. page.add_script_tag(content: File.read(Rails.root.join("app/assets/javascripts/Readability.js")))
  95. # return page
  96.  
  97. # Execute readability
  98. page.evaluate("new Readability(document).parse().content")
  99. # This will print the main content of the page
  100. end
  101.  
  102. def handle_ip_blocks(content)
  103. if is_blocked_by_cloudfare?(content) || is_blocked_by_reddit?(content)
  104. return "-"
  105. end
  106.  
  107. content
  108. end
  109.  
  110. def is_blocked_by_cloudfare?(content)
  111. block_terms = [
  112. "<p>The owner of this website",
  113. "has banned your access based on your browser's signature",
  114. "<p>Ray ID: ",
  115. "<span>Ray ID: ",
  116. '<span data-translate="error">Error</span>',
  117. "has banned the autonomous system number (ASN) your IP address",
  118. "has banned your access based on your browser's signature",
  119. '<span><span>Performance &amp; security by</span> <a rel="noopener noreferrer" href="https://www.cloudflare.com'
  120. ]
  121.  
  122. # return true if any of the blockterms exist in the string content
  123. block_terms.any? { |term| content.include?(term) }
  124. end
  125.  
  126. def is_blocked_by_reddit?(content)
  127. block_terms = [
  128. "if you think that we've incorrectly blocked you or you would like to discuss",
  129. "Your request has been blocked due to a network policy",
  130. 'Try logging in or creating an account <a href="https://www.reddit.com/login/">here</a> to get back to browsing'
  131. ]
  132.  
  133. # return true if any of the blockterms exist in the string content
  134. block_terms.any? { |term| content.include?(term) }
  135. end
  136.  
  137. def suppress_logging
  138. original_stderr = STDERR.clone
  139. STDERR.reopen(File.new("/dev/null", "w"))
  140. yield
  141. ensure
  142. STDERR.reopen(original_stderr)
  143. end
  144. end
  145.  
  146. # Replace 'http://example.com' with the actual URL you want to process
  147. # result = extract_content_with_node('http://example.com')
  148. # puts result.is_a?(Hash) && result.key?('error') ? result : result['content']
  149.  
Advertisement
Add Comment
Please, Sign In to add comment