# import the required libraries
from bs4 import BeautifulSoup
import requests

base_URL = 'https://www.reuters.com'

# get the news category
category_URL = f'{base_URL}/technology/'

page_count = 1
data_links = []

# specify ZenRows request parameters
params = {
    'url': category_URL,
    'apikey': '<YOUR_ZENROWS_API_KEY>',
    'js_render': 'true',
    'premium_proxy': 'true',
}

# route the request through ZenRows
response = requests.get('https://api.zenrows.com/v1/', params=params)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    print(f'Request successful for {category_URL}')

    # find matching class names to bypass obfuscated class characters
    article_links = soup.find_all(
        'a', class_=lambda value: value and 'media-story-card__headline' in value
    )

    # iterate through the link cards to get each article's unique href
    for link in article_links:
        href = link.get('href')
        # merge the article's unique URL with the base URL
        data_links.append(base_URL + href)
else:
    print(f'Error fetching links for page: {response.status_code}')

print(data_links)
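
# --- Optional follow-up (a minimal sketch, not part of the original snippet) ---
# Assuming the same ZenRows endpoint and API key, each collected link could be
# fetched in turn to pull the article page's HTML. The paragraph selector below
# is a hypothetical placeholder; Reuters' real class names are obfuscated and
# would need to be inspected in the browser first.
for article_url in data_links:
    article_params = {
        'url': article_url,
        'apikey': '<YOUR_ZENROWS_API_KEY>',
        'js_render': 'true',
        'premium_proxy': 'true',
    }
    article_response = requests.get('https://api.zenrows.com/v1/', params=article_params)
    if article_response.status_code == 200:
        article_soup = BeautifulSoup(article_response.content, 'html.parser')
        # hypothetical selector: replace with the actual article body container
        paragraphs = article_soup.find_all('p')
        article_text = ' '.join(p.get_text(strip=True) for p in paragraphs)
        print(article_text[:200])
    else:
        print(f'Error fetching article {article_url}: {article_response.status_code}')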