Guest User

Untitled

a guest
Nov 9th, 2025
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 18.35 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. """
  3. Ripley.cl stealth browser with Cloudflare bypass and session persistence.
  4. Uses Camoufox for anti-detection.
  5. """
  6.  
  7. import asyncio
  8. import json
  9. import os
  10. import random
  11. import time
  12. from pathlib import Path
  13. from typing import Optional
  14. from camoufox.async_api import AsyncCamoufox
  15.  
  16.  
  17. async def save_session(page, context, session_file: str = "session_cookies.json"):
  18. """Save browser cookies and storage for session reuse."""
  19. try:
  20. # Get all cookies from the context
  21. cookies = await context.cookies()
  22.  
  23. # Also try to get localStorage and sessionStorage
  24. storage_data = {}
  25. try:
  26. storage_data = await page.evaluate("""
  27. () => {
  28. const local = {};
  29. const session = {};
  30. for (let i = 0; i < localStorage.length; i++) {
  31. const key = localStorage.key(i);
  32. local[key] = localStorage.getItem(key);
  33. }
  34. for (let i = 0; i < sessionStorage.length; i++) {
  35. const key = sessionStorage.key(i);
  36. session[key] = sessionStorage.getItem(key);
  37. }
  38. return { localStorage: local, sessionStorage: session };
  39. }
  40. """)
  41. except:
  42. pass
  43.  
  44. session_data = {
  45. 'cookies': cookies,
  46. 'storage': storage_data
  47. }
  48.  
  49. with open(session_file, 'w') as f:
  50. json.dump(session_data, f, indent=2)
  51.  
  52. total_items = len(cookies) + len(storage_data.get('localStorage', {})) + len(storage_data.get('sessionStorage', {}))
  53. print(f"[+] Session saved: {session_file} ({len(cookies)} cookies, {total_items} total items)")
  54. return True
  55. except Exception as e:
  56. print(f"[!] Failed to save session: {e}")
  57. return False
  58.  
  59.  
  60. async def load_session(page, context, session_file: str = "session_cookies.json"):
  61. """Load saved cookies and storage to restore session."""
  62. try:
  63. if not os.path.exists(session_file):
  64. print(f"[!] No session found: {session_file}")
  65. return False
  66.  
  67. with open(session_file, 'r') as f:
  68. session_data = json.load(f)
  69.  
  70. # Handle old format (just cookies array) or new format (dict with cookies and storage)
  71. if isinstance(session_data, list):
  72. cookies = session_data
  73. storage_data = {}
  74. else:
  75. cookies = session_data.get('cookies', [])
  76. storage_data = session_data.get('storage', {})
  77.  
  78. if not cookies and not storage_data:
  79. print("[!] Empty session file")
  80. return False
  81.  
  82. # Restore cookies
  83. if cookies:
  84. await context.add_cookies(cookies)
  85.  
  86. # Restore localStorage and sessionStorage
  87. if storage_data:
  88. try:
  89. await page.evaluate("""
  90. (storageData) => {
  91. const local = storageData.localStorage || {};
  92. const session = storageData.sessionStorage || {};
  93. for (const [key, value] of Object.entries(local)) {
  94. localStorage.setItem(key, value);
  95. }
  96. for (const [key, value] of Object.entries(session)) {
  97. sessionStorage.setItem(key, value);
  98. }
  99. }
  100. """, storage_data)
  101. except:
  102. pass
  103.  
  104. total_items = len(cookies) + len(storage_data.get('localStorage', {})) + len(storage_data.get('sessionStorage', {}))
  105. print(f"[+] Session loaded: {session_file} ({len(cookies)} cookies, {total_items} total items)")
  106. return True
  107. except Exception as e:
  108. print(f"[!] Failed to load session: {e}")
  109. return False
  110.  
  111.  
  112. def is_session_valid(
  113. session_file: str = "session_cookies.json",
  114. max_age_hours: int = 24,
  115. max_uses: int = None
  116. ):
  117. """Check if session file exists, is recent enough, and hasn't been overused."""
  118. try:
  119. if not os.path.exists(session_file):
  120. return False
  121.  
  122. file_age_hours = (time.time() - os.path.getmtime(session_file)) / 3600
  123.  
  124. if file_age_hours > max_age_hours:
  125. print(f"[!] Session expired ({file_age_hours:.1f}h old)")
  126. return False
  127.  
  128. # Check usage count if max_uses is set
  129. if max_uses:
  130. stats_file = session_file.replace('.json', '_stats.json')
  131. if os.path.exists(stats_file):
  132. try:
  133. with open(stats_file, 'r') as f:
  134. stats = json.load(f)
  135. use_count = stats.get('use_count', 0)
  136. if use_count >= max_uses:
  137. print(f"[!] Session overused ({use_count}/{max_uses} requests)")
  138. return False
  139. except:
  140. pass
  141.  
  142. print(f"[+] Valid session found ({file_age_hours:.1f}h old)")
  143. return True
  144. except Exception as e:
  145. print(f"[!] Error checking session: {e}")
  146. return False
  147.  
  148.  
  149. def increment_session_usage(session_file: str = "session_cookies.json"):
  150. """Track session usage count."""
  151. stats_file = session_file.replace('.json', '_stats.json')
  152. stats = {'use_count': 0, 'created_at': time.time(), 'last_used': time.time()}
  153.  
  154. if os.path.exists(stats_file):
  155. try:
  156. with open(stats_file, 'r') as f:
  157. stats = json.load(f)
  158. except:
  159. pass
  160.  
  161. stats['use_count'] = stats.get('use_count', 0) + 1
  162. stats['last_used'] = time.time()
  163.  
  164. try:
  165. with open(stats_file, 'w') as f:
  166. json.dump(stats, f, indent=2)
  167. return stats['use_count']
  168. except:
  169. return None
  170.  
  171.  
  172. def rotate_session(session_file: str = "session_cookies.json"):
  173. """Delete old session to force creation of new one."""
  174. try:
  175. if os.path.exists(session_file):
  176. os.remove(session_file)
  177. print(f"[+] Session rotated: {session_file}")
  178.  
  179. stats_file = session_file.replace('.json', '_stats.json')
  180. if os.path.exists(stats_file):
  181. os.remove(stats_file)
  182.  
  183. return True
  184. except Exception as e:
  185. print(f"[!] Failed to rotate session: {e}")
  186. return False
  187.  
  188.  
  189. async def wait_for_cloudflare(page, max_wait: int = 40, skip_check: bool = False):
  190. """Wait for Cloudflare challenge completion and SPA load."""
  191. if skip_check:
  192. print("[+] Using saved session")
  193. return True
  194.  
  195. print("Checking for Cloudflare...")
  196. start_time = time.time()
  197. await asyncio.sleep(2)
  198.  
  199. last_content_length = 0
  200. stable_count = 0
  201.  
  202. while time.time() - start_time < max_wait:
  203. try:
  204. try:
  205. title = await page.title()
  206. except:
  207. title = ""
  208.  
  209. try:
  210. content = await page.content()
  211. except:
  212. print(f" Page loading... ({int(time.time() - start_time)}s)")
  213. await asyncio.sleep(2)
  214. continue
  215.  
  216. content_length = len(content)
  217.  
  218. cloudflare_indicators = [
  219. "Just a moment",
  220. "Checking your browser",
  221. "Please wait",
  222. "Verifying you are human",
  223. "cf-browser-verification",
  224. "challenge-running",
  225. "challenge-platform",
  226. ]
  227.  
  228. has_challenge = any(indicator.lower() in content.lower() or
  229. indicator.lower() in title.lower()
  230. for indicator in cloudflare_indicators)
  231.  
  232. has_real_content = any([
  233. "ripley" in content.lower() and content_length > 50000,
  234. "producto" in content.lower() and content_length > 50000,
  235. "gtag" in content.lower() and content_length > 50000,
  236. "dataLayer" in content.lower() and content_length > 50000,
  237. ])
  238.  
  239. if abs(content_length - last_content_length) < 1000:
  240. stable_count += 1
  241. else:
  242. stable_count = 0
  243.  
  244. last_content_length = content_length
  245.  
  246. if not has_challenge and has_real_content:
  247. print(f"[+] Cloudflare bypassed ({content_length:,} chars)")
  248. return True
  249.  
  250. if stable_count >= 3 and content_length > 100000:
  251. print(f"[+] Content loaded ({content_length:,} chars)")
  252. return True
  253.  
  254. elapsed = int(time.time() - start_time)
  255. if elapsed > 0 and elapsed % 5 == 0:
  256. status = "challenge" if has_challenge else "loading"
  257. print(f" [{status}] {content_length:,} chars, {elapsed}s")
  258.  
  259. await asyncio.sleep(1)
  260.  
  261. except Exception as e:
  262. print(f" Error: {str(e)[:50]}")
  263. await asyncio.sleep(2)
  264.  
  265. print(f"[!] Timeout after {max_wait}s")
  266. return False
  267.  
  268.  
  269. async def human_like_behavior(page):
  270. """Simulate human interaction patterns."""
  271. print("Simulating human behavior...")
  272. await asyncio.sleep(random.uniform(0.3, 0.8))
  273.  
  274. for _ in range(random.randint(1, 2)):
  275. x = random.randint(100, 800)
  276. y = random.randint(100, 600)
  277. await page.mouse.move(x, y)
  278. await asyncio.sleep(random.uniform(0.05, 0.15))
  279.  
  280. viewport_height = await page.evaluate("window.innerHeight")
  281. scroll_positions = [
  282. random.randint(300, 800),
  283. random.randint(800, 1500),
  284. ]
  285.  
  286. for scroll_y in scroll_positions:
  287. await page.evaluate(f"window.scrollTo({{ top: {scroll_y}, behavior: 'smooth' }})")
  288. await asyncio.sleep(random.uniform(0.3, 0.7))
  289.  
  290. await page.evaluate("window.scrollTo({ top: 0, behavior: 'smooth' })")
  291. await asyncio.sleep(random.uniform(0.2, 0.5))
  292.  
  293. print("[+] Behavior simulation complete")
  294.  
  295.  
  296. async def browse_and_extract_html(
  297. url: str,
  298. output_file: str = "output.html",
  299. headless: bool = False,
  300. max_retries: int = 3,
  301. use_session: bool = True,
  302. session_file: str = "ripley_session.json",
  303. session_max_age_hours: int = 24,
  304. session_max_uses: int = None
  305. ) -> Optional[str]:
  306. """
  307. Browse URL with Camoufox stealth browser and extract HTML.
  308.  
  309. Args:
  310. url: Target URL
  311. output_file: HTML output path
  312. headless: Run without GUI
  313. max_retries: Retry attempts
  314. use_session: Enable session persistence
  315. session_file: Session storage path
  316. session_max_age_hours: Max session age before rotation (default: 24h)
  317. session_max_uses: Max requests per session before rotation (default: None/unlimited)
  318. """
  319.  
  320. has_saved_session = use_session and is_session_valid(
  321. session_file,
  322. max_age_hours=session_max_age_hours,
  323. max_uses=session_max_uses
  324. )
  325.  
  326. # Auto-rotate if session is invalid
  327. if use_session and not has_saved_session and os.path.exists(session_file):
  328. rotate_session(session_file)
  329.  
  330. for attempt in range(max_retries):
  331. try:
  332. print(f"\n{'='*70}")
  333. print(f"Attempt {attempt + 1}/{max_retries}")
  334. print(f"{'='*70}")
  335. print(f"Starting Camoufox...")
  336.  
  337. async with AsyncCamoufox(
  338. headless=headless,
  339. humanize=True,
  340. geoip=True,
  341. exclude_addons=[],
  342. os="windows",
  343. ) as browser:
  344. print(f"[+] Browser launched")
  345.  
  346. context = browser.contexts[0] if browser.contexts else await browser.new_context()
  347.  
  348. print(f"Navigating to: {url}")
  349.  
  350. page = await browser.new_page()
  351.  
  352. session_loaded = False
  353. if use_session and has_saved_session:
  354. session_loaded = await load_session(page, context, session_file)
  355.  
  356. await page.set_extra_http_headers({
  357. 'Accept-Language': 'es-CL,es;q=0.9,en;q=0.8',
  358. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  359. 'Accept-Encoding': 'gzip, deflate, br',
  360. 'DNT': '1',
  361. 'Connection': 'keep-alive',
  362. 'Upgrade-Insecure-Requests': '1',
  363. })
  364.  
  365. try:
  366. print("Loading page...")
  367. response = await page.goto(
  368. url,
  369. wait_until="commit",
  370. timeout=60000
  371. )
  372.  
  373. if response:
  374. status = response.status
  375. print(f"[+] Response: {status}")
  376. if status == 403:
  377. print(" [!] Cloudflare challenge detected")
  378.  
  379. except Exception as nav_error:
  380. print(f"[!] Navigation warning: {nav_error}")
  381.  
  382. await asyncio.sleep(2)
  383.  
  384. challenge_passed = await wait_for_cloudflare(
  385. page,
  386. max_wait=40,
  387. skip_check=session_loaded
  388. )
  389.  
  390. if challenge_passed:
  391. print("Waiting for dynamic content...")
  392. await asyncio.sleep(1)
  393. else:
  394. print("Waiting for content...")
  395. await asyncio.sleep(2)
  396.  
  397. await human_like_behavior(page)
  398.  
  399. print("Extracting HTML...")
  400. html_content = await page.content()
  401.  
  402. if len(html_content) < 500:
  403. raise Exception("Content too short - blocked")
  404.  
  405. print(f"[+] Extracted {len(html_content):,} chars")
  406.  
  407. if "cloudflare" in html_content.lower() and len(html_content) < 5000:
  408. print("[!] Warning: Possible Cloudflare page")
  409. else:
  410. print("[+] Content OK")
  411.  
  412. with open(output_file, 'w', encoding='utf-8') as f:
  413. f.write(html_content)
  414.  
  415. print(f"[+] Saved: {output_file}")
  416.  
  417. screenshot_file = output_file.replace('.html', '.png')
  418. try:
  419. await page.screenshot(path=screenshot_file, full_page=False)
  420. print(f"[+] Screenshot: {screenshot_file}")
  421. except:
  422. print("[!] Screenshot failed")
  423.  
  424. if use_session:
  425. await save_session(page, context, session_file)
  426. # Track session usage
  427. use_count = increment_session_usage(session_file)
  428. if use_count and session_max_uses:
  429. print(f"[+] Session usage: {use_count}/{session_max_uses}")
  430. elif use_count:
  431. print(f"[+] Session usage: {use_count}")
  432.  
  433. await page.close()
  434.  
  435. print(f"\n{'='*70}")
  436. print("[SUCCESS]")
  437. print(f"{'='*70}\n")
  438.  
  439. return html_content
  440.  
  441. except Exception as e:
  442. print(f"\n[ERROR] Attempt {attempt + 1}: {e}")
  443.  
  444. if attempt < max_retries - 1:
  445. wait_time = random.uniform(3, 6) * (attempt + 1)
  446. print(f"Retrying in {wait_time:.1f}s...")
  447. await asyncio.sleep(wait_time)
  448. else:
  449. print(f"\n[FAILED] All attempts exhausted")
  450. raise
  451.  
  452. return None
  453.  
  454.  
  455. async def main():
  456. """Main entry point."""
  457. print("\n" + "="*70)
  458. print("RIPLEY.CL STEALTH BROWSER")
  459. print("="*70 + "\n")
  460.  
  461. url = "https://simple.ripley.cl/search/adagio%20teas"
  462. output_file = "ripley_search.html"
  463. session_file = "ripley_session.json"
  464. headless = True
  465. use_session = True
  466.  
  467. # Session rotation config (adjust based on use case)
  468. session_max_age_hours = 24 # Rotate after 24 hours
  469. session_max_uses = 50 # Rotate after 50 requests (set to None for unlimited)
  470.  
  471. print(f"URL: {url}")
  472. print(f"Output: {output_file}")
  473. print(f"Session: {session_file}")
  474. print(f"Headless: {headless}")
  475. print(f"Session rotation: {session_max_age_hours}h or {session_max_uses or 'unlimited'} uses")
  476. print(f"Features: Human behavior, GeoIP, Anti-detection")
  477.  
  478. try:
  479. html_content = await browse_and_extract_html(
  480. url,
  481. output_file,
  482. headless=headless,
  483. max_retries=3,
  484. use_session=use_session,
  485. session_file=session_file,
  486. session_max_age_hours=session_max_age_hours,
  487. session_max_uses=session_max_uses
  488. )
  489.  
  490. if html_content:
  491. print("\n" + "="*70)
  492. print("SUMMARY")
  493. print("="*70)
  494. print(f"Chars: {len(html_content):,}")
  495. print(f"Lines: {len(html_content.splitlines()):,}")
  496.  
  497. print("\n" + "="*70)
  498. print("PREVIEW (first 800 chars)")
  499. print("="*70)
  500. print(html_content[:800])
  501. if len(html_content) > 800:
  502. print("...")
  503.  
  504. print("\n" + "="*70)
  505. print("CONTENT CHECK")
  506. print("="*70)
  507.  
  508. indicators = {
  509. "Ripley branding": "ripley" in html_content.lower(),
  510. "Product data": "product" in html_content.lower(),
  511. "Search results": "search" in html_content.lower() or "resultado" in html_content.lower(),
  512. "Price info": "$" in html_content or "precio" in html_content.lower(),
  513. "JS framework": any(x in html_content for x in ["__NEXT_DATA__", "react", "angular", "vue"]),
  514. }
  515.  
  516. for indicator, present in indicators.items():
  517. status = "[+]" if present else "[-]"
  518. print(f"{status} {indicator}")
  519.  
  520. print("\n" + "="*70)
  521. print("[SUCCESS]")
  522. print("="*70 + "\n")
  523. else:
  524. print("\n[FAILED]")
  525.  
  526. except Exception as e:
  527. print(f"\n{'='*70}")
  528. print(f"[FATAL] {e}")
  529. print(f"{'='*70}\n")
  530. raise
  531.  
  532.  
# Script entry point: run the async main() only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
  535.  
Advertisement
Add Comment
Please, Sign In to add comment