Advertisement
dsuveges

hotel_scraper.py

Feb 3rd, 2023 (edited)
1,140
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.00 KB | None | 0 0
  1. import pandas as pd
  2. import requests
  3. from bs4 import BeautifulSoup
  4.  
  5. def get_last_page(soup: BeautifulSoup) -> int:
  6.     """Return last page of hotels."""
  7.     try:
  8.         return int(soup.find('ol', class_='pagination').find_all('li')[-1].text.strip())
  9.     except TypeError:
  10.         raise TypeError('Could not find last page')
  11.     except AttributeError:
  12.         raise AttributeError('Could not find last page')
  13.  
  14. # TODO: catch request or BS4 exceptions:
  15. def cook_soup(url: str) -> BeautifulSoup:
  16.     try:
  17.         page = requests.get(url)
  18.         return BeautifulSoup(page.content, 'html.parser')
  19.     except:
  20.         return None
  21.  
  22. def parse_itemprop(soup: BeautifulSoup, item: str, tag: str='span') -> str:
  23.     """Extract text from fields with 'itemprop' attribute. The field might be missing!"""
  24.     try:
  25.         return soup.find(tag, itemprop=item).text
  26.     except AttributeError:
  27.         return None
  28.  
  29. def extract_email(soup: BeautifulSoup) -> str:
  30.     """Emails are tricky: contains UTF8 non-printable characters, might be missing."""
  31.     try:
  32.         return (
  33.             soup
  34.             .find('div', class_='mobile-portrait-row mail')
  35.             .find('a')
  36.             .text
  37.             .encode('ascii', 'ignore')
  38.         )
  39.     except AttributeError:
  40.         return None
  41.  
  42. def parse_hotel(url: str) -> dict:
  43.    
  44.     # Extracting important part of the page:
  45.     soup = cook_soup(url)
  46.    
  47.     # If there were any issue with fetching data:
  48.     if soup is None: return None
  49.    
  50.     # Extract header that contain hotel details:
  51.     media_body = soup.find('div', class_ = 'media-body')
  52.    
  53.     return dict(
  54.         name = media_body.find('div', itemprop='name').text,
  55.         streetAddress = parse_itemprop(media_body, 'streetAddress'),
  56.         postalCode = parse_itemprop(media_body, 'postalCode'),
  57.         addressLocality = parse_itemprop(media_body, 'addressLocality'),
  58.         telephone = parse_itemprop(media_body, 'telephone'),
  59.         faxNumber = parse_itemprop(media_body, 'faxNumber'),
  60.         url = parse_itemprop(media_body, 'url', 'a'),
  61.         email = extract_email(media_body),
  62.         source_url = url
  63.     )
  64.  
  65. data_file = 'hotels.csv'
  66. base_url = 'https://www.firmenabc.at/firmen/at/hotels_CFW'
  67. last_page = get_last_page(cook_soup(base_url))
  68. print(f'Fetching hotels from {last_page} pages...')
  69.  
  70. data = []
  71. # Looping through all pages:
  72. for page in range(1, last_page +1):
  73.     print(f'Processind data on page: {page}')
  74.  
  75.     # Extract page data:
  76.     soup = cook_soup(f'{base_url}/{str(page)}')
  77.    
  78.     # Extract details for each hotel on the page (~40 hotel/page):
  79.     for hotel in soup.find_all('li', class_='card result'):
  80.         data.append(parse_hotel(hotel.find('a', itemprop='url').get('href')))
  81.  
  82. # There might be some nulls if hotel page cannot be converted to bs4:
  83. df = pd.DataFrame([x for x in data if x is not None])
  84.  
  85. print(f'Information from {len(df)} hotels was retrieved.')
  86. print(f'Saving data as csv: {data_file}')
  87. df.to_csv(data_file, sep='\t', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement