Advertisement
alperiox

bs4 scrapethissite

Dec 14th, 2022 (edited)
879
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.93 KB | None | 0 0
  # Scrape the lesson index at scrapethissite.com/pages/ and print one dict
  # per listed page with its title, absolute link and description.
  from urllib.parse import urljoin

  import requests
  from bs4 import BeautifulSoup

  base_url = "https://www.scrapethissite.com"
  url = "https://www.scrapethissite.com/pages/"

  # requests has no default timeout; without one a stalled server would
  # block this script forever.
  response = requests.get(url, timeout=10)
  # Stop on 4xx/5xx instead of silently parsing an error page.
  response.raise_for_status()

  soup = BeautifulSoup(response.text, "html.parser")

  extracted_data = []

  # Each listed page lives in its own <div class="page">.
  pages = soup.find_all("div", {"class": "page"})

  for page in pages:
      heading = page.find("h3")
      anchor = page.find("a")
      summary = page.find("p")

      # find() returns None when a tag is absent; skip incomplete entries
      # rather than crashing with AttributeError on `.text` / `.get`.
      if heading is None or anchor is None or summary is None:
          continue

      href = anchor.get("href")
      if not href:
          # An <a> without an href gives us nothing to link to.
          continue

      extracted_data.append(
          {
              "title": heading.text.strip(),
              # urljoin resolves site-relative hrefs against the base URL
              # and leaves already-absolute hrefs untouched, which plain
              # string concatenation would mangle.
              "link": urljoin(base_url, href),
              "description": summary.text.strip(),
          }
      )

  # Print every extracted record, one dict per line.
  for data in extracted_data:
      print(data)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement