Guest User

Untitled

a guest
Aug 4th, 2025
47
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.13 KB | None | 0 0
  1. # scrapy_chatgpt_spider.py
  2. """
  3. Scrapy Spider for ChatGPT conversations
  4. Optimized for Scrapy Cloud deployment
  5. """
  6.  
  7. import scrapy
  8. from scrapy import signals
  9. from scrapy.crawler import CrawlerProcess
  10. import json
  11. from urllib.parse import urlparse
  12. import time
  13.  
  14. class ChatGPTSpider(scrapy.Spider):
  15. name = 'chatgpt_scraper'
  16. custom_settings = {
  17. 'CONCURRENT_REQUESTS': 32,
  18. 'CONCURRENT_REQUESTS_PER_DOMAIN': 16,
  19. 'DOWNLOAD_DELAY': 0.5,
  20. 'RANDOMIZE_DOWNLOAD_DELAY': True,
  21. 'COOKIES_ENABLED': False,
  22. 'RETRY_TIMES': 3,
  23. 'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
  24.  
  25. # Scrapy-Playwright settings
  26. 'DOWNLOAD_HANDLERS': {
  27. 'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
  28. },
  29. 'PLAYWRIGHT_BROWSER_TYPE': 'chromium',
  30. 'PLAYWRIGHT_LAUNCH_OPTIONS': {
  31. 'headless': True,
  32. 'args': [
  33. '--no-sandbox',
  34. '--disable-setuid-sandbox',
  35. '--disable-dev-shm-usage',
  36. '--disable-accelerated-2d-canvas',
  37. '--no-first-run',
  38. '--no-zygote',
  39. '--disable-gpu'
  40. ]
  41. },
  42.  
  43. # AutoThrottle for adaptive delays
  44. 'AUTOTHROTTLE_ENABLED': True,
  45. 'AUTOTHROTTLE_START_DELAY': 0.5,
  46. 'AUTOTHROTTLE_MAX_DELAY': 10.0,
  47. 'AUTOTHROTTLE_TARGET_CONCURRENCY': 16.0,
  48.  
  49. # Stats for monitoring
  50. 'STATS_CLASS': 'scrapy.statscollectors.MemoryStatsCollector',
  51.  
  52. # Export settings
  53. 'FEEDS': {
  54. 's3://your-bucket/chatgpt-results/%(batch_id)s/%(name)s_%(time)s.jsonl': {
  55. 'format': 'jsonlines',
  56. 'store_empty': False,
  57. 'batch_item_count': 1000,
  58. }
  59. }
  60. }
  61.  
  62. def __init__(self, urls_file=None, batch_id='default', *args, **kwargs):
  63. super().__init__(*args, **kwargs)
  64. self.batch_id = batch_id
  65.  
  66. # Load URLs
  67. if urls_file:
  68. with open(urls_file) as f:
  69. self.start_urls = [url.strip() for url in f if url.strip()]
  70.  
  71. # Stats tracking
  72. self.processed_count = 0
  73. self.success_count = 0
  74. self.error_count = 0
  75.  
  76. async def parse(self, response):
  77. """Parse ChatGPT conversation page"""
  78. # Check if using Playwright
  79. if hasattr(response, 'meta') and 'playwright' in response.meta:
  80. page = response.meta['playwright_page']
  81.  
  82. # Wait for content
  83. await page.wait_for_selector('main', timeout=15000)
  84. await page.wait_for_timeout(2000)
  85.  
  86. # Extract data
  87. conversation_data = await page.evaluate('''() => {
  88. const extractMessages = () => {
  89. const messages = [];
  90. const selectors = [
  91. '[data-message-author-role]',
  92. '.text-base',
  93. '[class*="markdown"]',
  94. '.group'
  95. ];
  96.  
  97. let elements = [];
  98. for (const selector of selectors) {
  99. elements = document.querySelectorAll(selector);
  100. if (elements.length > 0) break;
  101. }
  102.  
  103. elements.forEach((el, idx) => {
  104. const isUser = el.closest('[data-message-author-role="user"]') ||
  105. (idx % 2 === 0);
  106.  
  107. messages.push({
  108. role: isUser ? 'user' : 'assistant',
  109. content: el.textContent.trim(),
  110. index: idx
  111. });
  112. });
  113.  
  114. return messages;
  115. };
  116.  
  117. return {
  118. title: document.title,
  119. messages: extractMessages(),
  120. conversation_id: window.location.pathname.split('/').pop()
  121. };
  122. }''')
  123.  
  124. await page.close()
  125.  
  126. # Yield result
  127. self.success_count += 1
  128. yield {
  129. 'url': response.url,
  130. 'title': conversation_data['title'],
  131. 'messages': conversation_data['messages'],
  132. 'conversation_id': conversation_data['conversation_id'],
  133. 'batch_id': self.batch_id,
  134. 'timestamp': time.time(),
  135. 'status': 'success'
  136. }
  137. else:
  138. # Fallback to regular parsing
  139. self.error_count += 1
  140. yield {
  141. 'url': response.url,
  142. 'status': 'error',
  143. 'error': 'Playwright not available',
  144. 'batch_id': self.batch_id,
  145. 'timestamp': time.time()
  146. }
  147.  
  148. self.processed_count += 1
  149.  
  150. # Log progress every 100 URLs
  151. if self.processed_count % 100 == 0:
  152. self.logger.info(
  153. f"Progress: {self.processed_count} processed, "
  154. f"{self.success_count} successful, {self.error_count} errors"
  155. )
  156.  
  157. @classmethod
  158. def from_crawler(cls, crawler, *args, **kwargs):
  159. spider = super().from_crawler(crawler, *args, **kwargs)
  160. crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
  161. return spider
  162.  
  163. def spider_closed(self, spider):
  164. """Generate final statistics"""
  165. stats = {
  166. 'spider': self.name,
  167. 'batch_id': self.batch_id,
  168. 'total_processed': self.processed_count,
  169. 'successful': self.success_count,
  170. 'errors': self.error_count,
  171. 'success_rate': (self.success_count / self.processed_count * 100) if self.processed_count > 0 else 0
  172. }
  173.  
  174. self.logger.info(f"Spider closed. Stats: {json.dumps(stats, indent=2)}")
  175.  
  176.  
  177. # Distributed runner for local testing
  178. class DistributedScrapyRunner:
  179. def __init__(self, urls_file, num_processes=4):
  180. self.urls_file = urls_file
  181. self.num_processes = num_processes
  182.  
  183. # Load and split URLs
  184. with open(urls_file) as f:
  185. all_urls = [url.strip() for url in f if url.strip()]
  186.  
  187. # Split into chunks
  188. chunk_size = len(all_urls) // num_processes
  189. self.url_chunks = []
  190.  
  191. for i in range(num_processes):
  192. start = i * chunk_size
  193. end = start + chunk_size if i < num_processes - 1 else len(all_urls)
  194.  
  195. chunk_file = f'urls_chunk_{i}.txt'
  196. with open(chunk_file, 'w') as f:
  197. f.write('\n'.join(all_urls[start:end]))
  198.  
  199. self.url_chunks.append(chunk_file)
  200.  
  201. def run_spider(self, chunk_file, process_id):
  202. """Run spider for a chunk"""
  203. process = CrawlerProcess({
  204. 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
  205. 'LOG_LEVEL': 'INFO',
  206. 'FEEDS': {
  207. f'results_process_{process_id}.jsonl': {
  208. 'format': 'jsonlines',
  209. }
  210. }
  211. })
  212.  
  213. process.crawl(ChatGPTSpider, urls_file=chunk_file, batch_id=f'process_{process_id}')
  214. process.start()
  215.  
  216. def run_distributed(self):
  217. """Run spiders in parallel processes"""
  218. import multiprocessing
  219.  
  220. processes = []
  221. for i, chunk_file in enumerate(self.url_chunks):
  222. p = multiprocessing.Process(
  223. target=self.run_spider,
  224. args=(chunk_file, i)
  225. )
  226. p.start()
  227. processes.append(p)
  228. print(f"Started process {i} for {chunk_file}")
  229.  
  230. # Wait for all processes
  231. for p in processes:
  232. p.join()
  233.  
  234. print("All processes completed")
  235.  
  236.  
# Scrapy Cloud deployment configuration
"""
scrapinghub.yml:
---
projects:
  default: YOUR_PROJECT_ID

stacks:
  default: scrapy:2.11-playwright

requirements:
  file: requirements.txt

"""
  251.  
  252. # requirements.txt for Scrapy Cloud
  253. """
  254. scrapy>=2.11.0
  255. scrapy-playwright>=0.0.26
  256. boto3>=1.26.0
  257. """
  258.  
  259. # Deploy script
  260. def create_deploy_script():
  261. deploy_script = """#!/bin/bash
  262. # Deploy to Scrapy Cloud
  263.  
  264. # Split URLs into batches
  265. split -l 2000 urls.txt urls_batch_
  266.  
  267. # Deploy spider
  268. shub deploy
  269.  
  270. # Schedule spider runs for each batch
  271. for batch in urls_batch_*; do
  272. echo "Scheduling spider for $batch"
  273.  
  274. # Upload batch to Scrapy Cloud
  275. shub items-api upload $batch
  276.  
  277. # Schedule spider
  278. shub schedule chatgpt_scraper \
  279. -a urls_file=$batch \
  280. -a batch_id=$(basename $batch) \
  281. -a CONCURRENT_REQUESTS=32 \
  282. -a DOWNLOAD_DELAY=0.5
  283.  
  284. sleep 2
  285. done
  286.  
  287. echo "All spiders scheduled!"
  288.  
  289. # Monitor progress
  290. watch -n 30 'shub jobs | head -20'
  291. """
  292.  
  293. with open('deploy_to_scrapy_cloud.sh', 'w') as f:
  294. f.write(deploy_script)
  295.  
  296. print("Created deploy_to_scrapy_cloud.sh")
  297.  
  298.  
  299. if __name__ == "__main__":
  300. import sys
  301.  
  302. if len(sys.argv) < 2:
  303. print("Usage:")
  304. print(" python scrapy_chatgpt_spider.py urls.txt # Run locally")
  305. print(" python scrapy_chatgpt_spider.py --deploy # Create deploy script")
  306. print(" python scrapy_chatgpt_spider.py --distributed 8 # Run with 8 processes")
  307. sys.exit(1)
  308.  
  309. if sys.argv[1] == '--deploy':
  310. create_deploy_script()
  311. elif sys.argv[1] == '--distributed':
  312. num_processes = int(sys.argv[2]) if len(sys.argv) > 2 else 4
  313. runner = DistributedScrapyRunner('urls.txt', num_processes)
  314. runner.run_distributed()
  315. else:
  316. # Run single spider
  317. process = CrawlerProcess()
  318. process.crawl(ChatGPTSpider, urls_file=sys.argv[1])
  319. process.start()
  320.  
Advertisement
Add Comment
Please, Sign In to add comment