qharr

Untitled

Apr 20th, 2019
from bs4 import BeautifulSoup
import requests
import pandas as pd

base = 'http://www.agriculture.gov.au'
headers = {'User-Agent': 'Mozilla/5.0'}
specimens = []

with requests.Session() as s:
    # Fetch the listing page of plant pests, diseases and weeds
    r = s.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases', headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')

    # Collect name, image URL and detail-page link for each tile,
    # prefixing relative URLs with the site base
    names, images, links = zip(*[
        (item.text.strip(),
         base + item.select_one('img')['src'],
         item['href'] if 'http' in item['href'] else base + item['href'])
        for item in soup.select('.flex-item > a')
    ])

    # Visit each detail page and pull the text of the div following the third trigger
    for link in links:
        r = s.get(link)
        soup = BeautifulSoup(r.content, 'lxml')
        try:
            info = soup.select_one('.trigger:nth-of-type(3) + div').text
        except AttributeError:
            # select_one returned None, i.e. the page has no matching section
            info = 'None'
            print(link)
        specimens.append(info)

# Assemble the scraped columns into a DataFrame, one row per specimen
df = pd.DataFrame([names, images, links, specimens])
df = df.transpose()
df.columns = ['names', 'image_link', 'link', 'specimen']
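A possible follow-up once the frame is built, if you want to keep the results on disk; the filename specimens.csv is illustrative:

df.to_csv('specimens.csv', index=False)  # write the scraped table to CSV without the row index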