Guest User

zepers silly scraper :3

a guest
Aug 8th, 2024
128
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.25 KB | None | 0 0
  1. # written by zeper56
  2. # dont skid please >w<
  3. # (idk why you would even skid my code is DOGSHIT)
  4.  
  5. import requests
  6. from bs4 import BeautifulSoup
  7. import json
  8. import time
  9. import random
  10. from tqdm import tqdm
  11.  
subs = ['webscraping']  # Subreddit names (without "r/") to scrape; edit this list as needed.
  13.  
  14. def wait(ms):
  15.     time.sleep(ms / 1000.0)
  16.  
  17. def scrape(sub):
  18.     url = f'https://old.reddit.com/r/{sub}/'
  19.     headers = {
  20.         'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
  21.     }
  22.     posts = []
  23.     pbar = tqdm(total=float('inf'), desc=f'Scraping {sub}', unit='post')
  24.  
  25.     while True:
  26.         try:
  27.             res = requests.get(url, headers=headers)
  28.             res.raise_for_status()
  29.             soup = BeautifulSoup(res.text, 'html.parser')
  30.            
  31.             for post in soup.select('.thing'):
  32.                 title = post.select_one('.title')
  33.                 comments = post.select_one('.comments')
  34.                 posts.append({
  35.                     'title': title.get_text() if title else None,
  36.                     'url': title.a['href'] if title and title.a else None,
  37.                     'author': post.select_one('.author').get_text() if post.select_one('.author') else None,
  38.                     'upvotes': post.select_one('.score.unvoted').get_text() if post.select_one('.score.unvoted') else None,
  39.                     'comments': comments.get_text() if comments else None,
  40.                     'imgLinks': [img['src'] for img in post.select('img')],
  41.                     'commentCount': comments.get_text().split(' ')[0] if comments else '0'
  42.                 })
  43.                 pbar.update(1)
  44.            
  45.             with open(f'./{sub}.json', 'a') as f:
  46.                 json.dump(posts, f, indent=2)
  47.                 f.write('\n')  
  48.  
  49.             nextButton = soup.select_one('.next-button a')
  50.             if nextButton:
  51.                 url = nextButton['href']
  52.                 wait(3000 + random.uniform(2000, 5000))
  53.             else:
  54.                 break
  55.  
  56.         except Exception as e:
  57.             print(f"Error: {e}")
  58.             break
  59.  
  60.     pbar.close()
  61.  
  62. def scrapesubs():
  63.     for sub in subs:
  64.         scrape(sub)
  65.  
# Script entry point: scrape every configured subreddit.
if __name__ == "__main__":
    scrapesubs()
Advertisement
Add Comment
Please, Sign In to add comment