Advertisement
idowupaul

Untitled

Jan 3rd, 2024
521
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.14 KB | Source Code | 0 0
  # import the required libraries
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

# Script purpose: fetch the Reuters technology category page through the
# ZenRows API and collect the URLs of the article headline links it contains.

base_URL = 'https://www.reuters.com'

# get the news category
category_URL = f'{base_URL}/technology/'

# NOTE(review): page_count is never read below; presumably reserved for
# pagination -- confirm or remove.
page_count = 1

data_links = []

# specify ZenRows request parameters
params = {
    'url': category_URL,
    'apikey': '<YOUR_ZENROWS_API_KEY>',
    'js_render': 'true',
    'premium_proxy': 'true',
}

# Upper bound (seconds) on how long to wait for the server. Without a timeout
# requests can block forever on a stalled connection.
# NOTE(review): 60 s is a guess that leaves room for JS rendering -- tune as needed.
request_timeout = 60

# route the request through ZenRows
response = requests.get(
    'https://api.zenrows.com/v1/',
    params=params,
    timeout=request_timeout,
)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    print(f'Request successful for {category_URL}')

    # find matching class names to bypass obfuscated class characters
    article_links = soup.find_all(
        'a',
        class_=lambda value: value and 'media-story-card__headline' in value,
    )

    # iterate through link cards to get article unique hrefs
    for link in article_links:
        href = link.get('href')

        # an anchor without an href has nothing to resolve; skipping it avoids
        # a TypeError from combining the base URL with None
        if not href:
            continue

        # resolve the href against the base URL; urljoin handles root-relative,
        # relative and already-absolute hrefs, unlike plain concatenation
        data_links.append(urljoin(base_URL, href))
else:
    print(f'Error fetching links for page: {response.status_code}')

print(data_links)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement