Mr_HO1A

Google News Scraping using BS4

Jun 18th, 2019
38
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.16 KB | None | 0 0
  1. # Author - Aman Vishwakarma
  2. # Laptop 3 : [Arch VM 3]
  3.  
  4.  
  5. # News Scraping using BeautifulSoup
  6.  
  7. from bs4 import BeautifulSoup
  8. import requests
  9. import os
  10. import json
  11. import re
  12. import urllib
  13. import urllib.request
  14.  
  15.  
  16. def downloadImage(imageUrl,imageNumber):
  17.     print("Downloaded : "+imageUrl)
  18.     urllib.request.urlretrieve(imageUrl, "NEWSImage_"+str(imageNumber)+".jpg")
  19.  
  20. def main():
  21.     #url = "https://timesofindia.indiatimes.com/india"
  22.     #url_google_news = "https://news.google.com/news/headlines?hl=en-IN&gl=IN&ned=in"
  23.     url_google_news = "https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx1YlY4U0FtVnVHZ0pKVGlnQVAB?hl=en-IN&gl=IN&ceid=IN%3Aen"
  24.    
  25.     data = requests.get(url_google_news)
  26.     soup = BeautifulSoup(data.content, "lxml")
  27.     # instead of html_parser, lxml can also be used
  28.  
  29.     path = os.path.abspath(os.path.dirname(__file__))
  30.     # Create Image Directory
  31.     try:
  32.         os.mkdir(path+"/images")
  33.     except:
  34.         0
  35.  
  36.     filename = os.path.join(path, 'world news.txt')
  37.     imageCounter = 1
  38.     links = soup.find_all('div', class_="xrnccd F6Welf R7GTQ keNKEd j7vNaf")
  39.     with open(filename, 'w') as f:
  40.         for link in links:
  41.             headlines = link.h3.a.text
  42.             #print(headlines)
  43.             f.write(headlines)
  44.             f.write("\n")
  45.             image = link.find_all('img',class_="tvs3Id QwxBBf")
  46.             imageSource = "None"
  47.             for img in image:
  48.                 if img["src"] == "":
  49.                     imageSource = "None"
  50.                 else:
  51.                    imageSource = img["src"]
  52.                    downloadImage(imageSource,imageCounter)
  53.                    imageCounter+=1
  54.             f.write(imageSource)
  55.             source = link.find('div', class_="SVJrMe")
  56.             src=source.text
  57.             print(src,"  ")
  58.             f.write('\n')
  59.             f.write(src)
  60.             f.write('\n')
  61.             #print('\n')
  62.             #f.write('\n')
  63.             #text = link.text
  64.             #headline_length = len(text.split())
  65.             #if headline_length > 6:
  66.                 #f.write(text)
  67.                 #f.write('\n')
  68.     #f.close()
  69.    
  70. main()
Add Comment
Please, Sign In to add comment