Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """ Methods for scraping product information from the tesco website.
- You will need to install requests, beautifulsoup4, lxml and pandas libraries
- $ pip install requests beautifulsoup4 lxml pandas
- """
- from typing import List, Optional
- import requests
- import pandas as pd
- from bs4 import BeautifulSoup
- from bs4.element import Tag
def fetch_page(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The parsed html page.

    Raises:
        requests.HTTPError: If the response status is not a success code.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response: requests.Response = requests.get(url, timeout=timeout)
    # Raise an exception if the response status is not 200
    response.raise_for_status()
    # Parse the html page with the lxml parser
    page: BeautifulSoup = BeautifulSoup(response.text, "lxml")
    return page
def extract_catagories(page: BeautifulSoup) -> List[str]:
    """Return all text strings from the first <ol> element on the page.

    NOTE(review): the name keeps the original "catagories" spelling so that
    existing callers are not broken; presumably the first <ol> holds the
    product category list — confirm against the page layout.

    Raises:
        RuntimeError: If the page contains no <ol> element (the original
            code crashed with an opaque AttributeError in this case).
    """
    category_list = page.find("ol")
    if category_list is None:
        raise RuntimeError("No <ol> element found in html page")
    return list(category_list.strings)
def extract_net_weight(page: BeautifulSoup) -> str:
    """Return the net weight value from the page's "net-contents" section.

    Looks up the element with id "net-contents", reads the text of its first
    <p> tag, and returns the first whitespace-separated token of that text.
    """
    contents_tag = page.find(id="net-contents")
    paragraph_text = contents_tag.p.text
    first_token = paragraph_text.split()[0]
    return first_token
def extract_nutrition_table(page: BeautifulSoup) -> pd.DataFrame:
    """Extract the product nutrition information table as a DataFrame.

    Scans every <table> in the page for one carrying the css class
    "product__info-table", parses it with pandas, and drops a trailing
    "Reference intake" footnote row when present.

    Args:
        page: Parsed html of a product page.

    Returns:
        The product information table as a DataFrame.

    Raises:
        RuntimeError: If no product information table is present in the page.
    """
    # Get all the html tables within the page
    tables: List[Tag] = page.find_all("table")
    # Find the product information table; stop at the first match.
    product_table_tag: Optional[Tag] = None
    for table in tables:
        # .get() avoids a KeyError on tables with no class attribute, and a
        # membership test tolerates extra classes in any position (the
        # original only compared the first class in the list).
        if "product__info-table" in table.get("class", []):
            product_table_tag = table
            break
    if product_table_tag is None:
        # BUG FIX: the original message interpolated an undefined name `url`,
        # so this path raised NameError instead of the intended RuntimeError.
        raise RuntimeError("No product information table found in html page")
    # read_html returns a list of all the DataFrames it finds in the supplied
    # html; we pass a single table, so take the first (only) one.
    product_table: pd.DataFrame = pd.read_html(str(product_table_tag))[0]
    # Sometimes the last row is just text about reference values, so we check
    # if that is the case and drop that last row if so.
    if product_table.tail(1)["Typical Values"].str.contains("Reference intake").any():
        product_table.drop(product_table.tail(1).index, inplace=True)
    return product_table
Add Comment
Please, Sign In to add comment