Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Author - Aman Vishwakarma
# Laptop 3 : [Arch VM 3]
# News Scraping using BeautifulSoup
import json
import os
import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
def downloadImage(imageUrl, imageNumber):
    """Download one image and save it as ``NEWSImage_<imageNumber>.jpg``.

    The file name is relative, so the image lands in the current working
    directory. The ``.jpg`` extension is always used, whatever the remote
    content actually is.

    Args:
        imageUrl: URL of the image to fetch.
        imageNumber: Value used to build the output file name.

    Returns:
        The file name the image was saved under.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises for an unreachable or
        malformed URL (for example ``urllib.error.URLError`` or
        ``ValueError``) propagates to the caller unchanged.
    """
    targetName = "NEWSImage_" + str(imageNumber) + ".jpg"
    urllib.request.urlretrieve(imageUrl, targetName)
    # Report only once the retrieval has returned, so a failed download is
    # never announced as completed.
    print("Downloaded : " + imageUrl)
    return targetName
def main():
    """Scrape a Google News topic page and save its headlines to a text file.

    The hard-coded topic page is fetched and parsed, then for every article
    card three lines are written to ``world news.txt`` (created next to this
    script):

    1. the headline text,
    2. the ``src`` of the last image examined on the card, or ``None`` when
       the card has no image or that image's ``src`` is empty,
    3. the publisher text, or ``None`` when the card has no publisher block.

    Every image with a non-empty ``src`` is also passed to ``downloadImage``.
    Cards without a headline link are skipped. The publisher text is echoed
    to stdout as each card is processed.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an error status.
    """
    url_google_news = "https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx1YlY4U0FtVnVHZ0pKVGlnQVAB?hl=en-IN&gl=IN&ceid=IN%3Aen"

    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status stops us from parsing an error page.
    response = requests.get(url_google_news, timeout=30)
    response.raise_for_status()
    # "lxml" needs the lxml package; "html.parser" is the built-in alternative.
    soup = BeautifulSoup(response.content, "lxml")

    path = os.path.abspath(os.path.dirname(__file__))

    # Create the image directory; an already existing one is not an error.
    # NOTE(review): downloadImage saves under a relative file name, so the
    # images end up in the working directory, not in this folder -- confirm
    # which location is intended.
    os.makedirs(os.path.join(path, "images"), exist_ok=True)

    filename = os.path.join(path, 'world news.txt')
    imageCounter = 1
    # The class strings below are Google News' generated CSS class names as
    # they appeared in the original script.
    links = soup.find_all('div', class_="xrnccd F6Welf R7GTQ keNKEd j7vNaf")

    # NOTE(review): the pasted source had lost its indentation. The nesting
    # below is the most plausible reading: download inside the non-empty
    # branch, image URL written once per card after the image loop.
    # Explicit UTF-8 so non-ASCII headlines do not depend on the platform's
    # default encoding.
    with open(filename, 'w', encoding="utf-8") as f:
        for link in links:
            heading = link.h3
            anchor = heading.a if heading is not None else None
            if anchor is None:
                # Card without a headline link: nothing meaningful to record.
                continue
            f.write(anchor.text)
            f.write("\n")

            imageSource = "None"
            for img in link.find_all('img', class_="tvs3Id QwxBBf"):
                candidate = img.get("src", "")
                if not candidate:
                    imageSource = "None"
                    continue
                imageSource = candidate
                try:
                    downloadImage(imageSource, imageCounter)
                except (OSError, ValueError) as error:
                    # One unreachable or malformed image URL should not
                    # abort the whole scrape; report it and carry on.
                    print("Could not download " + imageSource + " : " + str(error))
                else:
                    # Advance only on success so saved files stay numbered
                    # without gaps.
                    imageCounter += 1
            f.write(imageSource)

            source = link.find('div', class_="SVJrMe")
            src = source.text if source is not None else "None"
            print(src, " ")
            f.write('\n')
            f.write(src)
            f.write('\n')
# Run the scraper only when this file is executed as a script, so that
# importing the module does not start a scrape as a side effect.
if __name__ == "__main__":
    main()
Add Comment
Please, Sign In to add comment