tuomasvaltanen

Untitled

Apr 6th, 2021 (edited)
605
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.47 KB | None | 0 0
  1. # lecture 8, data management and processing
  2.  
  3. import numpy as np
  4. import pandas as pd
  5.  
  6. # with simple JSON, this is straightforward
  7. data = pd.read_json('testing.json')
  8.  
  9. average_value = data['value'].mean()
  10.  
  11. # no much difference with internet APIs
  12. weather = pd.read_json('https://edu.frostbit.fi/api/weather/')
  13.  
  14.  
  15. # just for comparison, doing something similar with plain Python
  16. import json
  17.  
  18. json_file = open("testing.json", "r")
  19. json_data = json_file.read()
  20. data = json.loads(json_data)
  21. json_file.close()
  22.  
  23. sum_values = 0
  24.  
  25. for row in data:
  26.     print(row['value'])
  27.     sum_values += row['value']
  28.    
  29.    
  30. # NEW FILE
  31.  
  32. import numpy as np
  33. import pandas as pd
  34. import json
  35.  
  36. json_file = open("somedata_simple.json", "r")
  37. response_raw = json_file.read()
  38. json_file.close()
  39.  
  40. # the actual JSON data is in this variable
  41. response = json.loads(response_raw)
  42.  
  43. # if we don't normalize
  44. test_data = pd.read_json("somedata_simple.json")
  45.  
  46. normalized_data = pd.json_normalize(response)
  47.  
  48.  
  49. complex_test = pd.read_json("somedata.json")
  50.  
  51. json_file = open("somedata.json", "r")
  52. response_raw = json_file.read()
  53. json_file.close()
  54.  
  55. # the actual JSON data is in this variable
  56. response = json.loads(response_raw)
  57.  
  58. complex_normalized = pd.json_normalize(response)
  59.  
  60. # one way is to convert the JSON list into a string and remove extra characters
  61. complex_normalized['activities'] = complex_normalized['activities'].astype(str)
  62. complex_normalized['activities'] = complex_normalized['activities'].str.replace("[", "")
  63. complex_normalized['activities'] = complex_normalized['activities'].str.replace("]", "")
  64. complex_normalized['activities'] = complex_normalized['activities'].str.replace("'", "")
  65.  
  66. # if normalizing a list of objects that also have lists
  67. # this has to be done. the more complex the actual data
  68. # the more complex this line of code gets (the meta-field)
  69. cities_new = pd.json_normalize(
  70.     response,
  71.     record_path  = ['activities'],
  72.     meta=['id', 'name',
  73.             ['contact', 'phone'],
  74.             ['contact', 'email']],
  75.             errors='ignore'
  76. )
  77.  
  78. # NEW FILE, TESTING JSON_FLATTEN
  79.  
  80. import requests
  81. import json
  82. import pandas as pd
  83. # using flatten_json to help us.... pip install flatten_json for install
  84. from flatten_json import flatten
  85.  
  86. json_file = open("somedata.json", "r")
  87. response_raw = json_file.read()
  88. json_file.close()
  89.  
  90. # the actual JSON data is in this variable
  91. response = json.loads(response_raw)
  92.  
  93. # this is for cases where there is a list of data
  94. # if only one object, then its just => dict_flattened = flatten(response)
  95. dict_flattened = (flatten(record, '.') for record in response)
  96.  
  97. events_df = pd.DataFrame(dict_flattened)
  98.  
  99. # use this if for some reason flatten_json doesn't drop this automatically
  100. #events_df = events_df.drop('contact', axis=1)
  101.  
  102. # NEW FILE
  103.  
  104. import pandas as pd
  105. import xml.etree.ElementTree as et
  106.  
  107. # create an ElementTree object of the file
  108. xtree = et.parse("somedata_single.xml")
  109. node = xtree.getroot()
  110.  
  111. # grab the values from the data-item in the XML
  112. e_id = node.find("id").text
  113. e_name = node.find("name").text
  114. e_value = node.find("value").text
  115.  
  116. # data and columns for new dataframe
  117. df_data = []
  118. df_cols = ["id", "name", "value"]
  119.  
  120. # add actual data
  121. row = {
  122.        "id": e_id,
  123.        "name": e_name,
  124.        "value": e_value
  125.        }
  126.  
  127. df_data.append(row)
  128.  
  129. # create DataFrame
  130. df = pd.DataFrame(data = df_data, columns = df_cols)
  131.  
  132. # NEW FILE
  133.  
  134. import pandas as pd
  135. import xml.etree.ElementTree as et
  136.  
  137. # same example as before, but with multiple items
  138. xtree = et.parse("somedata.xml")
  139. nodes = xtree.getroot()
  140.  
  141. df_cols = ["id", "name", "value"]
  142. rows = []
  143.  
  144. # multiple items => for-loop to iterate through them
  145. for node in nodes:
  146.     e_id = node.find("id").text
  147.     e_name = node.find("name").text
  148.     e_value = node.find("value").text
  149.    
  150.     rows.append({"id": e_id, "name": e_name, "value": e_value})
  151.    
  152.    
  153. out_df = pd.DataFrame(rows, columns = df_cols)
  154.  
  155. # NEW DATA, WEBSCRAPING WIKIPEDIA PAGE
  156.  
  157. import requests
  158. from bs4 import BeautifulSoup
  159.  
  160. # download source code of web page and make a BeautifulSoup object
  161. url = "https://en.wikipedia.org/wiki/Rovaniemi"
  162. page = requests.get(url)
  163. soup = BeautifulSoup(page.content, 'html.parser')
  164.  
  165. # id is preferred because it's supposed be unique in webpage
  166. cityname_by_id = soup.find('h1', id='firstHeading').text
  167.  
  168. # if using HTML/CSS class, remember to use underscore => class_
  169. cityname_by_class = soup.find('h1', class_='firstHeading').text
  170.  
  171. # NEW FILE
  172.  
  173. import requests
  174. from bs4 import BeautifulSoup
  175.  
  176. # download source code of web page and make a BeautifulSoup object
  177. url = "https://countrymeters.info/en/World"
  178. page = requests.get(url)
  179. soup = BeautifulSoup(page.content, 'html.parser')
  180.  
  181. # <div id="cp1">7,876,547,299</div>
  182. world_population = soup.find('div', id='cp1').text
  183. world_population = world_population.replace(",", "")
  184. world_population = int(world_population)
  185.  
  186. comparison = 7876547664
  187.  
  188. difference = world_population - comparison
  189.  
  190. # NEW FILE, WEB SCRAPE TABLES
  191.  
  192. import pandas as pd
  193.  
  194. # let's try with this
  195. url = "https://en.wikipedia.org/wiki/List_of_best-selling_video_games"
  196.  
  197. # if you use this , you might need to install lxml => pip install lxml
  198. data = pd.read_html(url)
  199.  
  200. # index 1 because in data -dataframe, the actual data is located in index 1
  201. actual_data = data[1]
  202.  
  203. print(actual_data.head())
  204.  
  205. # remove some useless columns
  206. actual_data = actual_data.drop('Ref(s).', axis=1)
  207. actual_data = actual_data.drop('Rank', axis=1)
Add Comment
Please, Sign In to add comment