- # lecture 8, data management and processing
- import numpy as np
- import pandas as pd
- # with simple JSON, this is straightforward
- data = pd.read_json('testing.json')
- average_value = data['value'].mean()
- # not much different when reading from a web API
- weather = pd.read_json('https://edu.frostbit.fi/api/weather/')
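- # a quick sanity check of what the API returned; head() and dtypes are
- # generic pandas tools, the actual column names depend on the API itself
- print(weather.head())
- print(weather.dtypes)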
- # just for comparison, doing something similar with plain Python
- import json
- json_file = open("testing.json", "r")
- json_data = json_file.read()
- data = json.loads(json_data)
- json_file.close()
- sum_values = 0
- for row in data:
- print(row['value'])
- sum_values += row['value']
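- # for comparison with the pandas mean above, the plain-Python average
- # (data is just a list of row dicts here, so len() gives the row count)
- average_value = sum_values / len(data)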
- # NEW FILE
- import numpy as np
- import pandas as pd
- import json
- json_file = open("somedata_simple.json", "r")
- response_raw = json_file.read()
- json_file.close()
- # the actual JSON data is in this variable
- response = json.loads(response_raw)
- # for comparison: reading the same file without normalizing
- test_data = pd.read_json("somedata_simple.json")
- normalized_data = pd.json_normalize(response)
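- # comparing the two shows what normalization does: json_normalize flattens
- # any nested objects into dotted column names
- print(test_data.columns)
- print(normalized_data.columns)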
- complex_test = pd.read_json("somedata.json")
- json_file = open("somedata.json", "r")
- response_raw = json_file.read()
- json_file.close()
- # the actual JSON data is in this variable
- response = json.loads(response_raw)
- complex_normalized = pd.json_normalize(response)
- # one way is to convert the JSON list into a string and remove extra characters
- complex_normalized['activities'] = complex_normalized['activities'].astype(str)
- # regex=False so the brackets are treated as plain characters, not as a regex
- complex_normalized['activities'] = complex_normalized['activities'].str.replace("[", "", regex=False)
- complex_normalized['activities'] = complex_normalized['activities'].str.replace("]", "", regex=False)
- complex_normalized['activities'] = complex_normalized['activities'].str.replace("'", "", regex=False)
- # if normalizing a list of objects that themselves contain lists,
- # this has to be done; the more complex the actual data gets,
- # the more complex this call gets (especially the meta field)
- cities_new = pd.json_normalize(
- response,
- record_path = ['activities'],
- meta=['id', 'name',
- ['contact', 'phone'],
- ['contact', 'email']],
- errors='ignore'
- )
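- # each activity becomes its own row, with the parent meta fields repeated
- # alongside it
- print(cities_new.head())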
- # NEW FILE, TESTING JSON_FLATTEN
- import requests
- import json
- import pandas as pd
- # using flatten_json to help us; install it with: pip install flatten_json
- from flatten_json import flatten
- json_file = open("somedata.json", "r")
- response_raw = json_file.read()
- json_file.close()
- # the actual JSON data is in this variable
- response = json.loads(response_raw)
- # this is for cases where there is a list of data
- # if there is only one object, then it's just => dict_flattened = flatten(response)
- dict_flattened = (flatten(record, '.') for record in response)
- events_df = pd.DataFrame(dict_flattened)
- # use this if for some reason flatten_json doesn't drop this automatically
- #events_df = events_df.drop('contact', axis=1)
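- # for reference, the single-object case mentioned above; flattening the
- # first record (just as an illustration) gives one dict, so wrap it in a
- # list before handing it to pandas
- single_flat = flatten(response[0], '.')
- single_df = pd.DataFrame([single_flat])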
- # NEW FILE
- import pandas as pd
- import xml.etree.ElementTree as et
- # create an ElementTree object of the file
- xtree = et.parse("somedata_single.xml")
- node = xtree.getroot()
- # grab the values from the data-item in the XML
- e_id = node.find("id").text
- e_name = node.find("name").text
- e_value = node.find("value").text
- # data and columns for new dataframe
- df_data = []
- df_cols = ["id", "name", "value"]
- # add actual data
- row = {
- "id": e_id,
- "name": e_name,
- "value": e_value
- }
- df_data.append(row)
- # create DataFrame
- df = pd.DataFrame(data = df_data, columns = df_cols)
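- # note: find() returns None when a tag is missing, which would make .text
- # crash; findtext() with a default is a safer variant, e.g.
- # node.findtext("value", default="")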
- # NEW FILE
- import pandas as pd
- import xml.etree.ElementTree as et
- # same example as before, but with multiple items
- xtree = et.parse("somedata.xml")
- nodes = xtree.getroot()
- df_cols = ["id", "name", "value"]
- rows = []
- # multiple items => for-loop to iterate through them
- for node in nodes:
- e_id = node.find("id").text
- e_name = node.find("name").text
- e_value = node.find("value").text
- rows.append({"id": e_id, "name": e_name, "value": e_value})
- out_df = pd.DataFrame(rows, columns = df_cols)
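- # ElementTree returns every .text as a string; if the values are meant to
- # be numeric, convert the column afterwards
- out_df["value"] = pd.to_numeric(out_df["value"])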
- # NEW FILE, WEB SCRAPING A WIKIPEDIA PAGE
- import requests
- from bs4 import BeautifulSoup
- # download source code of web page and make a BeautifulSoup object
- url = "https://en.wikipedia.org/wiki/Rovaniemi"
- page = requests.get(url)
- soup = BeautifulSoup(page.content, 'html.parser')
- # id is preferred because it's supposed to be unique within the page
- cityname_by_id = soup.find('h1', id='firstHeading').text
- # if using HTML/CSS class, remember to use underscore => class_
- cityname_by_class = soup.find('h1', class_='firstHeading').text
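- # find() returns only the first match; find_all() returns every match,
- # e.g. all the second-level headings on the page
- headings = [h.text for h in soup.find_all('h2')]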
- # NEW FILE
- import requests
- from bs4 import BeautifulSoup
- # download source code of web page and make a BeautifulSoup object
- url = "https://countrymeters.info/en/World"
- page = requests.get(url)
- soup = BeautifulSoup(page.content, 'html.parser')
- # <div id="cp1">7,876,547,299</div>
- world_population = soup.find('div', id='cp1').text
- world_population = world_population.replace(",", "")
- world_population = int(world_population)
- # a snapshot value for comparison; the counter on the page is live, so the two drift apart
- comparison = 7876547664
- difference = world_population - comparison
- # NEW FILE, WEB SCRAPE TABLES
- import pandas as pd
- # let's try with this
- url = "https://en.wikipedia.org/wiki/List_of_best-selling_video_games"
- # if you use this, you might need to install lxml => pip install lxml
- data = pd.read_html(url)
- # read_html returns a list of DataFrames; the table we want is at index 1
- actual_data = data[1]
- print(actual_data.head())
- # remove some useless columns
- actual_data = actual_data.drop('Ref(s).', axis=1)
- actual_data = actual_data.drop('Rank', axis=1)
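- # once the extra columns are gone, the cleaned table can be saved for
- # later use (the file name here is just an example)
- actual_data.to_csv("best_selling_games.csv", index=False)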