Guest User

Untitled

a guest
Jan 20th, 2019
112
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.80 KB | None | 0 0
  1. """ Methods for scraping product information from the tesco website.
  2. You will need to install requests, beautifulsoup4, lxml and pandas libraries
  3.  
  4. $ pip install requests beautifulsoup4 lxml pandas
  5. """
  6. from typing import List, Optional
  7.  
  8. import requests
  9.  
  10. import pandas as pd
  11.  
  12. from bs4 import BeautifulSoup
  13. from bs4.element import Tag
  14.  
  15.  
  16. def fetch_page(url: str) -> BeautifulSoup:
  17.  
  18. response: requests.Response = requests.get(url)
  19.  
  20. # Raise an exception if the response status is not 200
  21. response.raise_for_status()
  22.  
  23. # Parse the html page
  24. page: BeautifulSoup = BeautifulSoup(response.text, "lxml")
  25.  
  26. return page
  27.  
  28.  
  29. def extract_catagories(page: BeautifulSoup) -> List[str]:
  30.  
  31. return list(page.find("ol").strings)
  32.  
  33.  
  34. def extract_net_weight(page: BeautifulSoup) -> str:
  35.  
  36. return page.find(id="net-contents").p.text.split()[0]
  37.  
  38.  
  39. def extract_nutrition_table(page: BeautifulSoup) -> pd.DataFrame:
  40.  
  41. # Get all the html table within the page
  42. tables: List[Tag] = page.find_all("table")
  43.  
  44. # Check that we have the product information table
  45. product_table_tag: Optional[Tag] = None
  46. for table in tables:
  47. if table["class"][0] == "product__info-table":
  48. product_table_tag = table
  49.  
  50. if not product_table_tag:
  51. raise RuntimeError(f"No product information table found in html for URL: {url}")
  52.  
  53. # The read html method returns a list of all the DataFrame it finds in the supplied
  54. # html, we have only
  55. product_table: pd.DataFrame = pd.read_html(str(product_table_tag))[0]
  56.  
  57. # Sometimes the last row is just text about reference values so we check if that is
  58. # the case and drop that last row if so
  59. if product_table.tail(1)["Typical Values"].str.contains("Reference intake").any():
  60. product_table.drop(product_table.tail(1).index, inplace=True)
  61.  
  62. return product_table
Add Comment
Please, Sign In to add comment