Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from typing import Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_last_page(soup: BeautifulSoup) -> int:
    """Return the number of the last listing page.

    The number is read from the text of the final ``<li>`` inside the
    ``<ol class="pagination">`` element of ``soup``.

    Raises:
        AttributeError: ``soup`` is ``None`` or has no pagination list.
        IndexError: the pagination list contains no ``<li>`` entries.
        ValueError: the final entry's text is not an integer.
        TypeError: the lookup failed with a type error.
    """
    try:
        pagination = soup.find('ol', class_='pagination')
        return int(pagination.find_all('li')[-1].text.strip())
    except (AttributeError, IndexError, TypeError, ValueError) as err:
        # Re-raise with the same exception type so existing handlers keep
        # working, but chain the original error so its traceback is kept.
        raise type(err)('Could not find last page') from err
def cook_soup(url: str, timeout: float = 30.0) -> Optional[BeautifulSoup]:
    """Download ``url`` and parse the response body with ``html.parser``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the scrape indefinitely.

    Returns:
        The parsed document, or ``None`` when the page could not be
        downloaded or parsed.
    """
    try:
        page = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failure (bad URL, connection error, timeout, ...).
        # Callers treat None as "page unavailable".
        return None
    try:
        return BeautifulSoup(page.content, 'html.parser')
    except Exception:
        # Parser failures keep the original best-effort contract.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate so the script can still be stopped.
        return None
def parse_itemprop(soup: BeautifulSoup, item: str, tag: str = 'span') -> Optional[str]:
    """Return the text of the first ``tag`` element whose ``itemprop`` is ``item``.

    Args:
        soup: Parsed fragment to search in.
        item: Value of the ``itemprop`` attribute to look for.
        tag: Name of the element carrying the attribute.

    Returns:
        The element's text, or ``None`` when the field is missing.
    """
    try:
        return soup.find(tag, itemprop=item).text
    except AttributeError:
        # find() yielded None (field absent) or ``soup`` itself is None.
        return None
def extract_email(soup: BeautifulSoup) -> Optional[str]:
    """Return the e-mail address shown on a hotel page as ASCII text.

    The address is the text of the first ``<a>`` inside the
    ``<div class="mobile-portrait-row mail">`` element. Characters outside
    ASCII are dropped.

    Returns:
        The cleaned address as ``str``, or ``None`` when the e-mail block
        or its link is missing.
    """
    try:
        raw = soup.find('div', class_='mobile-portrait-row mail').find('a').text
    except AttributeError:
        # Either lookup returned None: the page lists no e-mail address.
        return None
    # encode() alone yields bytes, which would be written to the CSV as
    # "b'...'"; decode back so the declared str return type actually holds.
    return raw.encode('ascii', 'ignore').decode('ascii')
def parse_hotel(url: str) -> Optional[dict]:
    """Fetch one hotel detail page and extract its contact fields.

    Args:
        url: Address of the hotel's detail page.

    Returns:
        A dict with the keys ``name``, ``streetAddress``, ``postalCode``,
        ``addressLocality``, ``telephone``, ``faxNumber``, ``url``,
        ``email`` and ``source_url``. Fields absent from the page are
        ``None``. Returns ``None`` when the page cannot be fetched or has
        no ``media-body`` header block.
    """
    soup = cook_soup(url)
    # cook_soup signals a failed download with None.
    if soup is None:
        return None

    # All hotel details live in the header block of the page.
    media_body = soup.find('div', class_='media-body')
    if media_body is None:
        # Unexpected page layout: skip this hotel instead of aborting the
        # whole scrape with an AttributeError.
        return None

    return dict(
        # The name sits in a <div>, unlike the other fields. Going through
        # parse_itemprop makes a missing name None rather than a crash.
        name=parse_itemprop(media_body, 'name', 'div'),
        streetAddress=parse_itemprop(media_body, 'streetAddress'),
        postalCode=parse_itemprop(media_body, 'postalCode'),
        addressLocality=parse_itemprop(media_body, 'addressLocality'),
        telephone=parse_itemprop(media_body, 'telephone'),
        faxNumber=parse_itemprop(media_body, 'faxNumber'),
        url=parse_itemprop(media_body, 'url', 'a'),
        email=extract_email(media_body),
        source_url=url,
    )
# Output file and the first listing page of the hotel directory.
data_file = 'hotels.csv'
base_url = 'https://www.firmenabc.at/firmen/at/hotels_CFW'

# The pagination widget on the first page tells how many pages to walk.
last_page = get_last_page(cook_soup(base_url))
print(f'Fetching hotels from {last_page} pages...')

data = []
# Looping through all pages:
for page in range(1, last_page + 1):
    print(f'Processind data on page: {page}')
    # Extract page data:
    soup = cook_soup(f'{base_url}/{page}')
    if soup is None:
        # cook_soup returns None on a failed download; skip this page and
        # keep what was collected so far instead of crashing on find_all.
        print(f'Skipping page {page}: could not be fetched')
        continue
    # Extract details for each hotel on the page (~40 hotel/page):
    for hotel in soup.find_all('li', class_='card result'):
        link = hotel.find('a', itemprop='url')
        href = link.get('href') if link is not None else None
        if not href:
            # Card without a usable detail link: nothing to fetch.
            continue
        data.append(parse_hotel(href))

# There might be some nulls if hotel page cannot be converted to bs4:
df = pd.DataFrame([x for x in data if x is not None])
print(f'Information from {len(df)} hotels was retrieved.')
print(f'Saving data as csv: {data_file}')
# The file is tab-separated despite the .csv extension.
df.to_csv(data_file, sep='\t', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement