# Proton.ai application

import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# path to directory where file is
# (every backslash is escaped so that no invalid escape sequence such as
# '\P' is left in the literal; the resulting string is unchanged)
path = 'C:\\Personal\\Data Projects\\proton_ai\\'
# path to directory where to store clean datasets
clean_path = 'C:\\Personal\\Data Projects\\proton_ai\\clean\\'

# format of files to read
file_format = 'amazon_co-ecommerce_*.csv'

# create list with all files in specified directory
# (sorted because glob returns matches in arbitrary order and the report
# at the end of the script describes the last file processed)
files_list = sorted(glob.glob(path + file_format))

# regular expression pattern that uniq_id should have
id_pattern = re.compile(r'^\w{32}$')

# pattern to split number_available_in_stock into the number and the type,
# which are separated by a non-breaking space (\xa0); kept as text because
# it is handed to pandas' str.extract, and anchored with ^ because
# str.extract searches anywhere in the string
stock_pattern = r'^(?P<number>[0-9]+)\xa0(?P<type>[a-zA-Z]+)'


def clean_dataframe(df):
    """Clean one raw DataFrame loaded from a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. The columns ``uniq_id``, ``number_of_reviews``,
        ``number_of_answered_questions`` and ``number_available_in_stock``
        are read; ``df`` itself is not modified.

    Returns
    -------
    df_clean : pandas.DataFrame
        Copy of ``df`` without the records whose ``uniq_id`` does not match
        ``id_pattern``, without the columns that hold no data, with the
        numeric columns converted and with ``number_available_in_stock``
        split into a number and a new ``type_available_in_stock`` column.
        The index is reset.
    id_verifier_list : list of bool
        One entry per row of ``df``: True where ``uniq_id`` is valid.

    Raises
    ------
    KeyError
        If one of the columns listed above is not available.
    ValueError
        If a numeric column holds a value pandas cannot parse as a number.
    """
    # verify uniq_id; anything that is not a string (e.g. a missing value)
    # counts as invalid instead of raising a TypeError
    id_verifier_list = [isinstance(u_id, str)
                        and id_pattern.match(u_id) is not None
                        for u_id in df['uniq_id']]

    # drop the records with wrong uniq_id regex pattern; copy so that the
    # edits below act on an independent DataFrame and not on a slice of df
    df_clean = df.loc[id_verifier_list].copy()

    # drop columns without data
    df_clean = df_clean.dropna(axis=1, how='all')

    # converting numeric columns from type object to numeric
    # (plain column assignment replaces the column, so the new dtype is
    # kept; assigning through .loc[:, column] writes into the old column)
    for column in ('number_of_reviews', 'number_of_answered_questions'):
        df_clean[column] = pd.to_numeric(df_clean[column])

    # split number_available_in_stock in number and type; missing values
    # and values that do not match the pattern are left as missing
    stock_parts = df_clean['number_available_in_stock'].str.extract(
        stock_pattern)
    df_clean['number_available_in_stock'] = pd.to_numeric(
        stock_parts['number'])
    df_clean['type_available_in_stock'] = stock_parts['type']

    # reset index
    df_clean = df_clean.reset_index(drop=True)

    return df_clean, id_verifier_list


# for loop to clean the datasets and save them in separate files
for filename in files_list:

    # load data
    df = pd.read_csv(filename, thousands=',')

    df_clean, id_verifier_list = clean_dataframe(df)

    # Save clean DataFrame to csv files
    # extract the original name of the file (works for any path separator)
    clean_file_name = os.path.basename(filename)
    # add prefix `'clean_'` and save
    df_clean.to_csv(clean_path + 'clean_' + clean_file_name)

if not files_list:
    # nothing was loaded, so there is nothing to report on
    print('No files found matching: ', path + file_format)
else:
    # NOTE(review): the report below describes only the last file that was
    # processed in the loop above - confirm that this is intended

    # show info on how many not null values for each column
    # as well as the data type of each column
    # (info() prints by itself and returns None, so it is not passed to print)
    print('Column data types and number of not-null entries: ')
    df_clean.info()

    # name of the uniq_id removed from data
    wrong_id = [not i for i in id_verifier_list]
    print('`uniq_id` names for records removed: ',
          df.loc[wrong_id, 'uniq_id'])

    # statistical exploratory analysis on numeric columns
    print('statistics on numeric columns',
          df_clean.describe())

    # Create a histogram of number_available_in_stock
    df_clean['number_available_in_stock'].plot(kind='hist',
                                               title='Available in stock',
                                               bins=20)

    # Add second subplot, drawn inside the first one
    zoom_ax = plt.axes([0.5, 0.45, 0.35, 0.35])

    # zoom plot for number_available_in_stock > 35
    df_clean['number_available_in_stock'].plot(kind='hist',
                                               title='Available in stock > 35',
                                               bins=20,
                                               ylim=[0, 100],
                                               xlim=[35, 80],
                                               ax=zoom_ax)

    # histograms for number available in stock by type of stock
    df_clean['number_available_in_stock'].hist(
        by=df_clean['type_available_in_stock'])

    plt.show()