Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Site root; relative hrefs and image srcs scraped below are joined onto it.
base = 'http://www.agriculture.gov.au'
headers = {'User-Agent': 'Mozilla/5.0'}

specimens = []
with requests.Session() as s:
    # Install the UA on the session so EVERY request carries it; the original
    # passed headers= only on the first get, so the per-link fetches in the
    # loop went out without a User-Agent.
    s.headers.update(headers)

    r = s.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases')
    soup = BeautifulSoup(r.content, 'lxml')

    # Each listing tile (.flex-item > a) yields:
    #   name        - the anchor's stripped text
    #   image link  - the tile's <img src>, made absolute
    #   page link   - the anchor's href, made absolute when relative
    names, images, links = zip(*[
        (
            item.text.strip(),
            base + item.select_one('img')['src'],
            item['href'] if 'http' in item['href'] else base + item['href'],
        )
        for item in soup.select('.flex-item > a')
    ])

    for link in links:
        r = s.get(link)
        soup = BeautifulSoup(r.content, 'lxml')
        try:
            info = soup.select_one('.trigger:nth-of-type(3) + div').text
        except AttributeError:
            # select_one returned None: this page has no third trigger
            # section. (Was a bare `except:`, which also swallowed
            # KeyboardInterrupt and real bugs.)
            info = 'None'
            print(link)
        specimens.append(info)

# One row per column-list, then transpose so each specimen is a row.
df = pd.DataFrame([names, images, links, specimens])
df = df.transpose()
df.columns = ['names', 'image_link', 'link', 'specimen']
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement