Kalidor_Vorlich

Webscrape 1

Aug 21st, 2020 (edited)
164
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.29 KB | None | 0 0
  1.  
  2. #!/usr/bin/env python
  3.  
  4. """
  5. =====================================
  6. Webscraping 1
  7. =====================================
  8. Usage: %prog
  9. :Author: MerhuBerahu, https://github.com/MerhuBerahu
  10. :Date: 21/08/2020
  11. """
  12.  
  13. #import statements
  14. import requests
  15. import bs4
  16.  
  17. url = "https://robertsspaceindustries.com/galactapedia"
  18.  
  19. html_doc = requests.get(url)
  20. html_doc.raise_for_status()
  21.  
  22. #print(html_doc)
  23. soup = bs4.BeautifulSoup(html_doc.content, 'html.parser')
  24. type(soup)
  25.  
  26. links = []
  27. links2 = []
  28.  
  29. for link in soup.find_all('a'): # find all links in parsed data and create a list
  30.     #print(link.get('href'))
  31.     links.append(r"https://robertsspaceindustries.com" + link.get('href'))
  32.  
  33.  
  34.  
  35. for i in links: # for each link in links list parse that link then look for links on that page and create a new list of links2
  36.     html_doc = requests.get(i)
  37.     soup = bs4.BeautifulSoup(html_doc.content, 'html.parser')
  38.     soup.find_all('a')
  39.     print(soup.get('href'))
  40.     #links2.append(r"https://robertsspaceindustries.com" + i.get('href'))
  41.  
  42.  
  43. ### Commented out for time being ###
  44. """ for i in links:
  45.    title = html_soup.find("strong", class_="c-title c-title--x-large")
  46.    content = soup.find_all('div', class_='c-card__title')
  47.    print(title)
  48.    print(content) """
  49.  
  50. print(links)
  51. print(links2)
  52.  
Add Comment
Please, Sign In to add comment