Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """ Methods for scraping product information from the tesco website.
- You will need to install requests, beautifulsoup4, lxml and pandas libraries
- $ pip install requests beautifulsoup4 lxml pandas
- """
- from typing import List, Optional
- import requests
- import pandas as pd
- from bs4 import BeautifulSoup
- from bs4.element import Tag
def fetch_page(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The parsed html page.

    Raises:
        requests.HTTPError: If the response status is not a success code.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response: requests.Response = requests.get(url, timeout=timeout)
    # Raise an exception if the response status is not 200
    response.raise_for_status()
    # Parse the html page with the lxml parser
    page: BeautifulSoup = BeautifulSoup(response.text, "lxml")
    return page
def extract_catagories(page: BeautifulSoup) -> List[str]:
    """Return all text strings from the first <ol> element on the page.

    NOTE(review): the name keeps the original "catagories" spelling so that
    existing callers are not broken; presumably the first <ol> holds the
    product category list — confirm against the page layout.

    Raises:
        RuntimeError: If the page contains no <ol> element (the original
            code crashed with an opaque AttributeError in this case).
    """
    category_list = page.find("ol")
    if category_list is None:
        raise RuntimeError("No <ol> element found in html page")
    return list(category_list.strings)
def extract_net_weight(page: BeautifulSoup) -> str:
    """Return the net weight value from the page's "net-contents" section.

    Looks up the element with id "net-contents", reads the text of its first
    <p> tag, and returns the first whitespace-separated token of that text.
    """
    contents_tag = page.find(id="net-contents")
    paragraph_text = contents_tag.p.text
    first_token = paragraph_text.split()[0]
    return first_token
def extract_nutrition_table(page: BeautifulSoup) -> pd.DataFrame:
    """Extract the product nutrition information table as a DataFrame.

    Scans every <table> in the page for one carrying the css class
    "product__info-table", parses it with pandas, and drops a trailing
    "Reference intake" footnote row when present.

    Args:
        page: Parsed html of a product page.

    Returns:
        The product information table as a DataFrame.

    Raises:
        RuntimeError: If no product information table is present in the page.
    """
    # Get all the html tables within the page
    tables: List[Tag] = page.find_all("table")
    # Find the product information table; stop at the first match.
    product_table_tag: Optional[Tag] = None
    for table in tables:
        # .get() avoids a KeyError on tables with no class attribute, and a
        # membership test tolerates extra classes in any position (the
        # original only compared the first class in the list).
        if "product__info-table" in table.get("class", []):
            product_table_tag = table
            break
    if product_table_tag is None:
        # BUG FIX: the original message interpolated an undefined name `url`,
        # so this path raised NameError instead of the intended RuntimeError.
        raise RuntimeError("No product information table found in html page")
    # read_html returns a list of all the DataFrames it finds in the supplied
    # html; we pass a single table, so take the first (only) one.
    product_table: pd.DataFrame = pd.read_html(str(product_table_tag))[0]
    # Sometimes the last row is just text about reference values, so we check
    # if that is the case and drop that last row if so.
    if product_table.tail(1)["Typical Values"].str.contains("Reference intake").any():
        product_table.drop(product_table.tail(1).index, inplace=True)
    return product_table
Add Comment
Please, Sign In to add comment