Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Ruby class for web content extraction.
- #
- # Loads the page in headless Chrome via Puppeteer and extracts the main
- # content with Readability.js. Returns "" on error or when the URL is not
- # an HTML file, and "-" when the request was blocked (Cloudflare/Reddit).
- class ExtractContent
- include Scrapers::RandomUseragent
- def fetch_content(url)
- # If this was intended to be the primary method, uncomment and fix
- # return FetchFeedMasterdataJob.fetch_page(url)
- execute_with_suppressed_logging do |browser|
- browser_session(browser, url) do |page, context|
- setup_page(page)
- content = extract_content(browser, page, url)
- handle_ip_blocks(content)
- end
- end
- rescue => e
- Rails.logger.error "ExtractContent.fetch_content failed: #{e.class} - #{e.message}"
- Rails.logger.error e.backtrace.join("\n") if Rails.env.development?
- ""
- end
- private
- def setup_page(page)
- # Handle cookies, JavaScript execution, etc.
- # For example, enable JavaScript
- page.javascript_enabled = true
- # random browser window size
- width = rand(1024..1920)
- height = rand(768..1080)
- page.viewport = Puppeteer::Viewport.new(width: width, height: height)
- # random mouse moving
- # page.mouse.move(x: rand(0..width), y: rand(0..height))
- # random scrolling
- scroll_height = rand(0..height)
- # page.evaluate("window.scrollBy(0, #{scroll_height})")
- # page.keyboard.press('ArrowDown')
- # ... any other setup code ...
- end
- def execute_with_suppressed_logging
- suppress_logging do
- options = {
- executable_path: ENV["CHROME_PATH"],
- args: [
- '--disable-http2',
- '--no-sandbox',
- '--disable-dev-shm-usage', # Add this for Docker environments
- '--disable-gpu', # Add this for headless environments
- '--headless' # Make sure we're running headless
- ]
- }
- Puppeteer.launch(**options) do |browser|
- yield(browser)
- end
- end
- end
- def browser_session(browser, url)
- context = browser.default_browser_context
- page = context.new_page
- page.set_user_agent(random_useragent)
- # Increase timeouts for production
- page.default_navigation_timeout = 20_000 # 60 seconds
- page.default_timeout = 20_000 # 60 seconds
- yield(page, context)
- ensure
- page&.close rescue nil
- context&.close rescue nil
- end
- def extract_content(browser, page, url)
- page.goto(url, wait_until: "networkidle2", timeout: 20_000)
- # readability_js = File.read(Rails.root.join('app/assets/javascripts/readability.js'))
- # page.evaluate(readability_js)
- readability_js = ""
- # if Rails.env.production?
- # readability_js = File.read(Rails.root.join('assets/javascripts/readability.js'))
- # else
- # readability_js = File.read(Rails.root.join('app/assets/javascripts/readability.js'))
- # end
- page.add_script_tag(content: File.read(Rails.root.join("app/assets/javascripts/Readability.js")))
- # return page
- # Execute readability
- page.evaluate("new Readability(document).parse().content")
- # This will print the main content of the page
- end
- def handle_ip_blocks(content)
- if is_blocked_by_cloudfare?(content) || is_blocked_by_reddit?(content)
- return "-"
- end
- content
- end
- def is_blocked_by_cloudfare?(content)
- block_terms = [
- "<p>The owner of this website",
- "has banned your access based on your browser's signature",
- "<p>Ray ID: ",
- "<span>Ray ID: ",
- '<span data-translate="error">Error</span>',
- "has banned the autonomous system number (ASN) your IP address",
- "has banned your access based on your browser's signature",
- '<span><span>Performance & security by</span> <a rel="noopener noreferrer" href="https://www.cloudflare.com'
- ]
- # return true if any of the blockterms exist in the string content
- block_terms.any? { |term| content.include?(term) }
- end
- def is_blocked_by_reddit?(content)
- block_terms = [
- "if you think that we've incorrectly blocked you or you would like to discuss",
- "Your request has been blocked due to a network policy",
- 'Try logging in or creating an account <a href="https://www.reddit.com/login/">here</a> to get back to browsing'
- ]
- # return true if any of the blockterms exist in the string content
- block_terms.any? { |term| content.include?(term) }
- end
- def suppress_logging
- original_stderr = STDERR.clone
- STDERR.reopen(File.new("/dev/null", "w"))
- yield
- ensure
- STDERR.reopen(original_stderr)
- end
- end
- # Usage example — replace the URL with the page you want to process:
- #   content = ExtractContent.new.fetch_content('http://example.com')
- #   puts content  # "" on error, "-" when blocked, otherwise the extracted HTML
Advertisement
Add Comment
Please, Sign In to add comment