Advertisement
furas

scraping yellowpage

Apr 15th, 2017
191
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.67 KB | None | 0 0
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
  3.  
  4. headers = {'User-Agent': 'Mozilla/5.0'}
  5.  
  6. def my_spider(max_pages):
  7.     for page in range(1, max_pages+1):
  8.         url = 'http://www.yellowpages.com/search?search_terms=business&geo_location_terms=California%20City%2C%20CA&page=' + str(page)
  9.  
  10.         response = requests.get(url, headers=headers)
  11.         soup = BeautifulSoup(response.text, "html.parser")
  12.  
  13.         for link in soup.findAll('a',{'class':'business-name'}):
  14.             href = link.get('href')
  15.             print('[DEBUG] href:', href)
  16.             if not href.startswith('http'):
  17.                 href = "https://www.yellowpages.com"+ link.get('href')
  18.                 get_every_single_data(href)
  19.  
  20.        
  21. def get_every_single_data(any_url):
  22.     response = requests.get(any_url, headers=headers)
  23.     soup = BeautifulSoup(response.text, "html.parser" )
  24.    
  25.     for title in soup.findAll('div', {'class': 'sales-info'}):
  26.         print("NAME: ", title.text)
  27.        
  28.     for address in soup.findAll('p', {'class': 'address'}):
  29.         print("ADDRESS: ", address.text)
  30.        
  31.     if not soup.findAll('a', {'class': 'email-business'}):
  32.         print("NILL ")
  33.     else:
  34.         for email in soup.findAll('a', {'class': 'email-business'}):
  35.              print("EMAIL: ", email.get('href'))
  36.     for phone in soup.findAll('p', {'class': 'phone'}):
  37.         print("PHONE: ", phone.string)
  38.     if not soup.findAll('a', {'class': 'secondary-btn website-link'}):
  39.         print("WEBSITE: NILL")
  40.     else:
  41.         for website in soup.findAll('a', {'class': 'secondary-btn website-link'}):
  42.             print("WEBSITE: ", website.get('href'))
  43.  
  44.     print('\n\n')
  45.  
  46. my_spider(2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement