# Untitled

# lecture 8, data management and processing

import numpy as np
import pandas as pd

# pandas parses a flat JSON file straight into a DataFrame
data = pd.read_json("testing.json")

# mean of the "value" column
average_value = data["value"].mean()

# not much difference with internet APIs: pass the URL instead of a file name
weather = pd.read_json("https://edu.frostbit.fi/api/weather/")


# just for comparison, doing something similar with plain Python
import json

# Plain-Python version of the pandas example: load the JSON file and add up
# the "value" field of every record.
# The context manager closes the file even if reading fails, and the explicit
# encoding avoids depending on the platform's default text encoding.
with open("testing.json", "r", encoding="utf-8") as json_file:
    json_data = json_file.read()

# parse the raw text into Python objects (iterated below as a list of records)
data = json.loads(json_data)

sum_values = 0

# print each value and accumulate the running total
for row in data:
    print(row['value'])
    sum_values += row['value']


# NEW FILE

import numpy as np
import pandas as pd
import json

# Read the raw JSON text. The context manager closes the file even if reading
# fails, and the explicit encoding avoids depending on the platform default.
with open("somedata_simple.json", "r", encoding="utf-8") as json_file:
    response_raw = json_file.read()

# the actual JSON data is in this variable
response = json.loads(response_raw)

# if we don't normalize: read_json builds the DataFrame directly from the file
test_data = pd.read_json("somedata_simple.json")

# json_normalize builds the DataFrame from the already-parsed JSON instead
normalized_data = pd.json_normalize(response)


# reading the nested file directly, for comparison with the normalized version
complex_test = pd.read_json("somedata.json")

# The context manager closes the file even if reading fails, and the explicit
# encoding avoids depending on the platform default.
with open("somedata.json", "r", encoding="utf-8") as json_file:
    response_raw = json_file.read()

# the actual JSON data is in this variable
response = json.loads(response_raw)

complex_normalized = pd.json_normalize(response)

# one way is to convert the JSON list into a string and remove extra characters
# regex=False is explicit because "[" and "]" are regex metacharacters and the
# default of the regex parameter differs between pandas versions
complex_normalized['activities'] = (
    complex_normalized['activities']
    .astype(str)
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace("'", "", regex=False)
)

# When the objects in the list contain a list of their own (here
# "activities"), point record_path at that inner list and name in meta the
# parent fields to carry along; nested parent fields are given as key paths.
# The more complex the data, the longer the meta list becomes.
cities_new = pd.json_normalize(
    response,
    record_path=['activities'],
    meta=[
        'id',
        'name',
        ['contact', 'phone'],
        ['contact', 'email'],
    ],
    errors='ignore',
)

# NEW FILE, TESTING FLATTEN_JSON

import requests
import json
import pandas as pd
# using flatten_json to help us.... pip install flatten_json for install
from flatten_json import flatten

# Read the raw JSON text. The context manager closes the file even if reading
# fails, and the explicit encoding avoids depending on the platform default.
with open("somedata.json", "r", encoding="utf-8") as json_file:
    response_raw = json_file.read()

# the actual JSON data is in this variable
response = json.loads(response_raw)

# this is for cases where there is a list of data
# if only one object, then it's just => dict_flattened = flatten(response)
# '.' is passed to flatten() as the separator argument
# NOTE: this is a generator, so it is used up once the DataFrame is built
dict_flattened = (flatten(record, '.') for record in response)

events_df = pd.DataFrame(dict_flattened)

# use this if for some reason flatten_json doesn't drop this automatically
#events_df = events_df.drop('contact', axis=1)

# NEW FILE

import pandas as pd
import xml.etree.ElementTree as et

# parse the XML file; the root element is the data item itself
xtree = et.parse("somedata_single.xml")
node = xtree.getroot()

# column names of the new DataFrame, which are also the child tags to read
df_cols = ["id", "name", "value"]

# text content of each child element, in column order
e_id, e_name, e_value = (node.find(tag).text for tag in df_cols)

# one dict per DataFrame row; here there is just the single row
row = dict(zip(df_cols, (e_id, e_name, e_value)))
df_data = [row]

# create DataFrame
df = pd.DataFrame(df_data, columns=df_cols)

# NEW FILE

import pandas as pd
import xml.etree.ElementTree as et

# same example as before, but the root element now holds multiple items
xtree = et.parse("somedata.xml")
nodes = xtree.getroot()

# column names of the DataFrame, which are also the child tags read per item
df_cols = ["id", "name", "value"]
rows = []

# iterating over the root yields its child elements, one per item
for node in nodes:
    e_id, e_name, e_value = (node.find(tag).text for tag in df_cols)
    rows.append(dict(zip(df_cols, (e_id, e_name, e_value))))

# one DataFrame row per collected dict
out_df = pd.DataFrame(data=rows, columns=df_cols)

# NEW DATA, WEBSCRAPING WIKIPEDIA PAGE

import requests
from bs4 import BeautifulSoup

# download source code of web page and make a BeautifulSoup object
url = "https://en.wikipedia.org/wiki/Rovaniemi"
# without a timeout, requests can wait forever on an unresponsive server
page = requests.get(url, timeout=10)
# stop here on an HTTP error status instead of parsing an error page
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# id is preferred because it's supposed to be unique in a webpage
cityname_by_id = soup.find('h1', id='firstHeading').text

# if using HTML/CSS class, remember to use underscore => class_
cityname_by_class = soup.find('h1', class_='firstHeading').text

# NEW FILE

import requests
from bs4 import BeautifulSoup

# download source code of web page and make a BeautifulSoup object
url = "https://countrymeters.info/en/World"
# without a timeout, requests can wait forever on an unresponsive server
page = requests.get(url, timeout=10)
# stop here on an HTTP error status instead of parsing an error page
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# the counter looks like this in the page source:
# <div id="cp1">7,876,547,299</div>
world_population = soup.find('div', id='cp1').text
# strip the thousands separators so the text can be converted to an integer
world_population = world_population.replace(",", "")
world_population = int(world_population)

# fixed reference figure that the scraped value is compared against
comparison = 7876547664

difference = world_population - comparison

# NEW FILE, WEB SCRAPE TABLES

import pandas as pd

# page whose HTML tables we want as DataFrames
url = "https://en.wikipedia.org/wiki/List_of_best-selling_video_games"

# read_html returns a list of DataFrames, one per table it finds
# (you might need to install lxml for this => pip install lxml)
data = pd.read_html(url)

# the table we are interested in is at index 1 of that list
actual_data = data[1]

print(actual_data.head())

# remove some useless columns, both in a single call
actual_data = actual_data.drop(columns=['Ref(s).', 'Rank'])