Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Ruby class for web content extraction.
- #
- # Loads the page in headless Chrome via Puppeteer and extracts the main
- # content with Readability.js. Returns "" on error or when the URL is not
- # an HTML file, and "-" when the request was blocked (Cloudflare/Reddit).
- class ExtractContent
- include Scrapers::RandomUseragent
- def fetch_content(url)
- # If this was intended to be the primary method, uncomment and fix
- # return FetchFeedMasterdataJob.fetch_page(url)
- execute_with_suppressed_logging do |browser|
- browser_session(browser, url) do |page, context|
- setup_page(page)
- content = extract_content(browser, page, url)
- handle_ip_blocks(content)
- end
- end
- rescue => e
- Rails.logger.error "ExtractContent.fetch_content failed: #{e.class} - #{e.message}"
- Rails.logger.error e.backtrace.join("\n") if Rails.env.development?
- ""
- end
- private
- def setup_page(page)
- # Handle cookies, JavaScript execution, etc.
- # For example, enable JavaScript
- page.javascript_enabled = true
- # random browser window size
- width = rand(1024..1920)
- height = rand(768..1080)
- page.viewport = Puppeteer::Viewport.new(width: width, height: height)
- # random mouse moving
- # page.mouse.move(x: rand(0..width), y: rand(0..height))
- # random scrolling
- scroll_height = rand(0..height)
- # page.evaluate("window.scrollBy(0, #{scroll_height})")
- # page.keyboard.press('ArrowDown')
- # ... any other setup code ...
- end
- def execute_with_suppressed_logging
- suppress_logging do
- options = {
- executable_path: ENV["CHROME_PATH"],
- args: [
- '--disable-http2',
- '--no-sandbox',
- '--disable-dev-shm-usage', # Add this for Docker environments
- '--disable-gpu', # Add this for headless environments
- '--headless' # Make sure we're running headless
- ]
- }
- Puppeteer.launch(**options) do |browser|
- yield(browser)
- end
- end
- end
- def browser_session(browser, url)
- context = browser.default_browser_context
- page = context.new_page
- page.set_user_agent(random_useragent)
- # Increase timeouts for production
- page.default_navigation_timeout = 20_000 # 60 seconds
- page.default_timeout = 20_000 # 60 seconds
- yield(page, context)
- ensure
- page&.close rescue nil
- context&.close rescue nil
- end
- def extract_content(browser, page, url)
- page.goto(url, wait_until: "networkidle2", timeout: 20_000)
- # readability_js = File.read(Rails.root.join('app/assets/javascripts/readability.js'))
- # page.evaluate(readability_js)
- readability_js = ""
- # if Rails.env.production?
- # readability_js = File.read(Rails.root.join('assets/javascripts/readability.js'))
- # else
- # readability_js = File.read(Rails.root.join('app/assets/javascripts/readability.js'))
- # end
- page.add_script_tag(content: File.read(Rails.root.join("app/assets/javascripts/Readability.js")))
- # return page
- # Execute readability
- page.evaluate("new Readability(document).parse().content")
- # This will print the main content of the page
- end
- def handle_ip_blocks(content)
- if is_blocked_by_cloudfare?(content) || is_blocked_by_reddit?(content)
- return "-"
- end
- content
- end
- def is_blocked_by_cloudfare?(content)
- block_terms = [
- "<p>The owner of this website",
- "has banned your access based on your browser's signature",
- "<p>Ray ID: ",
- "<span>Ray ID: ",
- '<span data-translate="error">Error</span>',
- "has banned the autonomous system number (ASN) your IP address",
- "has banned your access based on your browser's signature",
- '<span><span>Performance & security by</span> <a rel="noopener noreferrer" href="https://www.cloudflare.com'
- ]
- # return true if any of the blockterms exist in the string content
- block_terms.any? { |term| content.include?(term) }
- end
- def is_blocked_by_reddit?(content)
- block_terms = [
- "if you think that we've incorrectly blocked you or you would like to discuss",
- "Your request has been blocked due to a network policy",
- 'Try logging in or creating an account <a href="https://www.reddit.com/login/">here</a> to get back to browsing'
- ]
- # return true if any of the blockterms exist in the string content
- block_terms.any? { |term| content.include?(term) }
- end
- def suppress_logging
- original_stderr = STDERR.clone
- STDERR.reopen(File.new("/dev/null", "w"))
- yield
- ensure
- STDERR.reopen(original_stderr)
- end
- end
- # Usage example — replace the URL with the page you want to process:
- #   content = ExtractContent.new.fetch_content('http://example.com')
- #   puts content  # "" on error, "-" when blocked, otherwise the extracted HTML
Advertisement
Add Comment
Please, Sign In to add comment