Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
# Sent with every request; a browser-like User-Agent is supplied instead of
# the default one the requests library would send.
headers = {'User-Agent': 'Mozilla/5.0'}
def my_spider(max_pages):
    """Crawl Yellow Pages search results and print details of each listing.

    Fetches result pages 1 through ``max_pages`` (inclusive) of a fixed
    "business" search for California City, CA, and passes every
    ``a.business-name`` link found on them to ``get_every_single_data``.

    Args:
        max_pages: Number of result pages to fetch, starting at page 1.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched
            (including when the request times out).
    """
    for page in range(1, max_pages + 1):
        url = 'http://www.yellowpages.com/search?search_terms=business&geo_location_terms=California%20City%2C%20CA&page=' + str(page)
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.find_all('a', {'class': 'business-name'}):
            href = link.get('href')
            print('[DEBUG] href:', href)
            if not href:
                # An anchor with no href has nothing to follow; calling
                # .startswith() on None would raise AttributeError.
                continue
            if not href.startswith('http'):
                # Site-relative link: prefix the host to make it absolute.
                href = "https://www.yellowpages.com" + href
            get_every_single_data(href)
def get_every_single_data(any_url):
    """Fetch one listing page and print its name, address and contact data.

    Prints every matching name, address, e-mail link, phone number and
    website link found on the page, followed by blank lines as a separator.
    When no e-mail or website link is present, a labelled ``NILL``
    placeholder is printed instead.

    Args:
        any_url: Absolute URL of the listing page to fetch.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the request times out).
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(any_url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    for title in soup.find_all('div', {'class': 'sales-info'}):
        print("NAME: ", title.text)

    for address in soup.find_all('p', {'class': 'address'}):
        print("ADDRESS: ", address.text)

    # Search once and reuse the result for both the emptiness check and the
    # loop, rather than scanning the document twice.
    emails = soup.find_all('a', {'class': 'email-business'})
    if not emails:
        # Labelled like the website placeholder so the output stays readable.
        print("EMAIL: NILL")
    for email in emails:
        print("EMAIL: ", email.get('href'))

    for phone in soup.find_all('p', {'class': 'phone'}):
        # .string is None when the tag has more than one child; .text always
        # returns the tag's text content.
        print("PHONE: ", phone.text)

    websites = soup.find_all('a', {'class': 'secondary-btn website-link'})
    if not websites:
        print("WEBSITE: NILL")
    for website in websites:
        print("WEBSITE: ", website.get('href'))

    # Blank lines separate one listing's output from the next.
    print('\n\n')
if __name__ == "__main__":
    # Crawl the first two result pages when run as a script; the guard keeps
    # an import of this module from triggering network requests.
    my_spider(2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement