Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Directory that holds the raw CSV files. Backslashes are escaped explicitly:
# sequences such as '\P' or '\D' are invalid escapes that only work by
# accident and raise SyntaxWarning on recent Python versions.
path = 'C:\\Personal\\Data Projects\\proton_ai\\'
# Directory where the clean datasets are stored.
clean_path = path + 'clean\\'
# Glob pattern of the files to read.
file_format = 'amazon_co-ecommerce_*.csv'
# List with all matching files in the specified directory.
files_list = glob.glob(path + file_format)

# Pattern a valid uniq_id must follow: exactly 32 word characters.
# Compiled once here instead of once per file.
id_pattern = re.compile(r'^\w{32}$')
# Pattern that splits number_available_in_stock into its count and its stock
# type; the two parts are separated by a non-breaking space (\xa0).
stock_pattern = re.compile(
    r'^(?P<number_available_in_stock>[0-9]+)'
    r'\xa0'
    r'(?P<type_available_in_stock>[a-zA-Z]+)'
)
# Columns that must end up with a numeric dtype.
numeric_columns = ('number_of_reviews', 'number_of_answered_questions')


def clean_dataset(df):
    """Return a cleaned copy of *df* and the mask of rows that were kept.

    Steps: drop rows whose ``uniq_id`` does not match ``id_pattern``, drop
    columns that hold no data, convert the review/question counters to
    numeric, and split ``number_available_in_stock`` into a numeric count and
    a new ``type_available_in_stock`` column.

    Returns:
        A ``(df_clean, valid_id)`` tuple. ``valid_id`` is a boolean Series
        aligned with ``df``; ``True`` marks rows whose ``uniq_id`` is valid.

    Raises:
        ValueError: if a counter column holds a value ``pd.to_numeric``
            cannot parse.
    """
    # na=False treats a missing uniq_id as invalid instead of raising.
    valid_id = df['uniq_id'].str.match(id_pattern.pattern, na=False)

    # Work on an explicit copy so later assignments never write into a view
    # of the raw frame (avoids SettingWithCopyWarning).
    df_clean = df.loc[valid_id].copy()
    # Drop columns without data.
    df_clean = df_clean.dropna(axis=1, how='all')

    # Plain column assignment replaces the column and therefore its dtype;
    # `df.loc[:, col] = ...` writes in place and can leave the dtype as object.
    for column in numeric_columns:
        df_clean[column] = pd.to_numeric(df_clean[column])

    # Values that are missing or do not follow the pattern become NaN in
    # both resulting columns instead of aborting the whole run.
    stock = df_clean['number_available_in_stock'].str.extract(
        stock_pattern.pattern
    )
    df_clean['number_available_in_stock'] = pd.to_numeric(
        stock['number_available_in_stock']
    )
    df_clean['type_available_in_stock'] = stock['type_available_in_stock']

    return df_clean.reset_index(drop=True), valid_id


def print_report(df, df_clean, valid_id):
    """Print dtype/not-null info, the removed ``uniq_id`` values and stats."""
    # DataFrame.info() prints by itself and returns None, so it must not be
    # passed to print() as an argument.
    print('Column data types and number of not-null entries: ')
    df_clean.info()
    # uniq_id values of the records removed from the data.
    print('`uniq_id` names for records removed: ',
          df.loc[~valid_id, 'uniq_id'])
    # Statistical exploratory analysis on numeric columns.
    print('statistics on numeric columns',
          df_clean.describe())


def plot_stock(df_clean):
    """Show histograms of ``number_available_in_stock``.

    Draws the overall histogram with a zoomed inset for the sparse tail, plus
    one histogram per stock type in a separate figure, then calls
    ``plt.show()``.
    """
    stock = df_clean['number_available_in_stock']

    # A fresh figure per call, so one file's plot is never drawn on top of
    # axes left over from the previous file.
    fig, main_ax = plt.subplots()
    stock.plot(kind='hist',
               ax=main_ax,
               title='Available in stock',
               bins=20)
    # Inset axes: zoomed view for number_available_in_stock > 35.
    zoom_ax = fig.add_axes([0.5, 0.45, 0.35, 0.35])
    stock.plot(kind='hist',
               ax=zoom_ax,
               title='Available in stock > 35',
               bins=20,
               ylim=[0, 100],
               xlim=[35, 80])
    # Histograms of the number available in stock, by type of stock.
    stock.hist(by=df_clean['type_available_in_stock'])
    plt.show()


# Create the reports and save each clean dataset in a separate file.
for filename in files_list:
    # Load data.
    df = pd.read_csv(filename, thousands=',')
    df_clean, valid_id = clean_dataset(df)

    # Save the clean DataFrame under the original file name with the prefix
    # 'clean_'. basename() handles whichever separator glob returned.
    os.makedirs(clean_path, exist_ok=True)
    clean_file_name = os.path.basename(filename)
    df_clean.to_csv(clean_path + 'clean_' + clean_file_name)

    print_report(df, df_clean, valid_id)
    plot_stock(df_clean)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement