from bs4 import BeautifulSoup
import requests


class GScraper(object):
    """Scraper for headlines and articles on the G1 news portal."""

    def __init__(self):
        self._url = "http://g1.globo.com"
        self.response = None
        self.soup = None

    def get_response(self):
        # Fetch the homepage and build the soup; True on HTTP 200, else False.
        response = requests.get(self._url)
        if response.status_code == 200:
            # Strip non-ASCII characters before parsing
            self.response = response.text.encode('utf-8').decode('ascii', 'ignore')
            self.soup = BeautifulSoup(self.response, 'lxml')
            return True
        else:
            return False

    def news(self):
        # Collect (headline, URL) tuples from the feed links on the homepage.
        all_news2return = []
        all_news = self.soup.find_all('a', {'class': 'feed-post-link'})
        for news in all_news:
            all_news2return.append((news.text, news['href']))
        return all_news2return

    def details(self, link):
        # Fetch a single article and print its title and body paragraphs.
        if link:
            r = requests.get(link)
            if r.status_code == 200:
                response = r.text.encode('utf-8').decode('ascii', 'ignore')
                soup = BeautifulSoup(response, 'lxml')
                title = soup.find('h1', {'class': 'content-head__title'})
                paragraphs = soup.find_all('p', {'class': 'content-text__container'})
                if title:
                    print(title.text)
                for p in paragraphs:
                    if p.text:
                        print(p.text)
        else:
            return False


if __name__ == "__main__":
    g1_bot = GScraper()
    if g1_bot.get_response():
        links = g1_bot.news()
        last_link = links[0][1]
        g1_bot.details(last_link)
else:
    print("I won't run")