Advertisement
sbmonzur

ScrapingMultiplePages

Mar 26th, 2021
532
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.44 KB | None | 0 0
  1. import requests, csv, os
  2. from bs4 import BeautifulSoup
  3. from time import strftime, sleep
  4.  
  from urllib.parse import urljoin

  # Scrape a prothomalo.com search-results page, collect the story links it
  # lists, then fetch every story and keep the elements of interest.

  # Search-results page to start from (the query string is percent-encoded).
  SEARCH_URL_PA = 'https://www.prothomalo.com/search?q=%E0%A6%A7%E0%A6%B0%E0%A7%8D%E0%A6%B7%E0%A6%A3'

  # requests applies no timeout by default; without one a stalled connection
  # would block the script indefinitely.
  REQUEST_TIMEOUT_PA = 30  # seconds

  # Class strings of the story-page elements to keep. Each entry is passed to
  # find_all() as a whole class-attribute string.
  # NOTE(review): these look like build-generated class names -- confirm they
  # still match the live markup before relying on the results.
  STORY_CLASSES_PA = [
      "story-element story-element-text",
      "time-social-share-wrapper storyPageMetaData-m__time-social-share-wrapper__2-RAX",
      "headline headline-type-9 story-headline bn-story-headline headline-m__headline__3vaq9 headline-m__headline-type-9__3gT8S",
      "contributor-name contributor-m__contributor-name__1-593",
  ]

  # make a GET request and store the response in a response object
  responsePA = requests.get(SEARCH_URL_PA, timeout=REQUEST_TIMEOUT_PA)
  # Fail loudly on an HTTP error instead of scraping an error page.
  responsePA.raise_for_status()

  # read the content of the server's response
  rawPagePA = responsePA.text

  # Name the parser explicitly so the parse tree does not depend on which
  # optional parsers happen to be installed on this machine.
  soupPA = BeautifulSoup(rawPagePA, "html.parser")
  # take a look
  print(soupPA.prettify())

  urlsPA = []  # absolute URLs of the stories found on the search page
  for item in soupPA.find_all("div", class_="customStoryCard9-m__story-data__2qgWb"):
      aTag = item.find("a")  # first <a> inside the story card, if any
      if aTag is None or not aTag.get("href"):
          # Card without a usable link: skip it rather than crash.
          continue
      # Resolve relative hrefs against the page URL; absolute hrefs pass
      # through urljoin unchanged.
      urlsPA.append(urljoin(responsePA.url, aTag["href"]))

  print(urlsPA)

  # PAlist[i] holds the elements extracted from urlsPA[i].
  PAlist = []
  for link in urlsPA:
      try:
          specificpagePA = requests.get(link, timeout=REQUEST_TIMEOUT_PA)
          specificpagePA.raise_for_status()
      except requests.RequestException as err:
          # One unreachable story should not abort the whole run; record an
          # empty result so PAlist stays aligned with urlsPA.
          print(f"Skipping {link}: {err}")
          PAlist.append([])
          continue
      rawAddPagePA = specificpagePA.text  # read the content of the server's response
      PASoup2 = BeautifulSoup(rawAddPagePA, "html.parser")  # parse the response into an HTML tree
      PAcontent = PASoup2.find_all(class_=STORY_CLASSES_PA)
      PAlist.append(PAcontent)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement