Advertisement
partsunev

03_extract_countries_population

Sep 15th, 2024
268
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.50 KB | None | 0 0
  1. import os
  2. import json
  3. import requests
  4. import pandas as pd
  5.  
  6. from bs4 import BeautifulSoup
  7. from io import StringIO
  8.  
  9. URL = "https://en.wikipedia.org/wiki/List_of_European_Union_member_states_by_population"
  10. DATA_FILE = "eu_population_data.json"
  11.  
  12. def get_soup(url):
  13.     try:
  14.         response = requests.get(url)
  15.         response.raise_for_status()
  16.         soup_data = BeautifulSoup(response.text, "lxml")
  17.         return soup_data
  18.     except requests.exceptions.RequestException as e:
  19.         print(f"Error fetching data from {url}: {e}")
  20.         return None
  21.  
  22.  
  23. def countries_population(df):
  24.     country_dict = {}
  25.  
  26.     country_column = "Country"
  27.     population_column = [col for col in df.columns if 'Population' in col][0]
  28.     df[population_column] = df[population_column].astype(str).str.replace(',', '').astype(int)
  29.  
  30.     for _, row in df.iterrows():
  31.         country = row[country_column]
  32.         population = row[population_column]
  33.  
  34.         country_dict[country] = {"country_population": population}
  35.  
  36.     return country_dict
  37.  
  38.  
  39. def population_percentage(data):
  40.     total_population = sum([pop["country_population"] for pop in data.values()])
  41.  
  42.     for country_data in data.values():
  43.         percentage = (country_data["country_population"] / total_population) * 100
  44.         country_data["country_population_percentage"] = round(percentage, 1)
  45.  
  46.     return data
  47.  
  48.  
  49. def load_saved_data(file_path):
  50.     if os.path.exists(file_path):
  51.         with open(file_path, "r") as file:
  52.             return json.load(file)
  53.     return None
  54.  
  55.  
  56. def save_data_to_file(data, file_path):
  57.     with open(file_path, "w") as file:
  58.         json.dump(data, file, indent=4)
  59.  
  60.  
  61. def is_data_updated(old_data, new_data):
  62.     return old_data != new_data
  63.  
  64.  
  65. soup = get_soup(URL)
  66.  
  67. if soup is not None:
  68.     table = soup.find("table")
  69.     if table is None:
  70.         raise ValueError("No table found on the webpage.")
  71.  
  72.     df = pd.read_html(StringIO(str(table)))[0]
  73.     df = df[df["Country"] != "European Union"]
  74.  
  75.     countries_population_dict = countries_population(df)
  76.     countries_population_percentage = population_percentage(countries_population_dict)
  77.  
  78.     saved_data = load_saved_data(DATA_FILE)
  79.  
  80.     if saved_data is None or is_data_updated(saved_data, countries_population_percentage):
  81.         print("New data found. Saving to file...")
  82.         save_data_to_file(countries_population_percentage, DATA_FILE)
  83.     else:
  84.         print("No new data. File not updated.")
  85. else:
  86.     print("Failed to retrieve webpage data.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement