Advertisement
Guest User

Proton.ai application

a guest
Sep 15th, 2019
342
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.05 KB | None | 0 0
import glob
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

  7. # path to directory where file is
  8. path = 'C:\Personal\Data Projects\proton_ai\\'
  9. # path to directory where to store clean datasets
  10. clean_path = 'C:\Personal\Data Projects\proton_ai\clean\\'
  11.  
  12. # format of files to read
  13. file_format = 'amazon_co-ecommerce_*.csv'
  14.  
  15. # create list with all files in specified directory
  16. files_list = glob.glob(path + file_format)
  17.  
  18. # for loop to create the reports and save the clean datasets in separate files
  19. for filename in files_list:
  20.  
  21.     # load data
  22.     df = pd.read_csv(filename, thousands=',')
  23.  
  24.     # store regular expresison pattern that uniq_id should have
  25.     id_pattern = re.compile('^\w{32}$')
  26.  
  27.     # empty list to append bool values of id_verifier
  28.     id_verifier_list = []
  29.  
  30.     # for loop to verify uniq_id
  31.     for u_id in range(len(df.uniq_id)):
  32.         id_verifier = bool(id_pattern.match(df.uniq_id[u_id]))
  33.         id_verifier_list.append(id_verifier)
  34.  
  35.     # drop the records with wrong uniq_id regex pattern
  36.     df_clean = df.loc[id_verifier_list]
  37.  
  38.     # drop columns without data
  39.     df_clean.dropna(axis=1, how='all', inplace=True)
  40.  
  41.     # converting numeric columns from type object to numeric
  42.     df_clean.loc[:,'number_of_reviews'] = pd.to_numeric(df_clean.loc[:,'number_of_reviews'])
  43.     df_clean.loc[:,'number_of_answered_questions'] = pd.to_numeric(df_clean.loc[:,'number_of_answered_questions'])
  44.  
  45.     # pattern to split number_available_in_stock column
  46.     stock_pattern = re.compile('([0-9]+)(\xa0)([a-zA-Z]+)')
  47.  
  48.     # get the notnulls from number_available_in_stock column
  49.     stock_series = df_clean.loc[df_clean['number_available_in_stock']\
  50.                                 .notnull(),'number_available_in_stock']
  51.  
  52.     stock_split_data = [stock_pattern.match(string).\
  53.                         groups() for string in stock_series]
  54.  
  55.     # create df with number in one column and type in other column
  56.     df_stock = pd.DataFrame(stock_split_data,
  57.                             index=stock_series.index,
  58.                             columns=['number_available_in_stock',
  59.                                      'space',
  60.                                      'type_available_in_stock'])
  61.     df_stock.drop('space',axis=1,inplace=True)
  62.  
  63.     # add df_stock created above to df_clean
  64.     df_clean['number_available_in_stock'] = pd.to_numeric(df_stock['number_available_in_stock'])
  65.     df_clean['type_available_in_stock'] = df_stock['type_available_in_stock']
  66.  
  67.     # reset index
  68.     df_clean.reset_index(drop=True, inplace=True)
  69.    
  70.    
  71.     # Save clean DataFrame to csv files
  72.     # extract the origial name of the file
  73.     clean_file_name = re.search('.*\\\(.*.csv)$', filename)
  74.     # add prefix `'clean_'` and save
  75.     df_clean.to_csv(clean_path + 'clean_' + clean_file_name.group(1))
  76.  
  77. # show info on how many not null values for each column
  78. # as well as the data type of each column
  79. print('Column data types and number of not-null entries: ',
  80.       df_clean.info())
  81.  
  82. # name of the uniq_id removed from data
  83. wrong_id = [not i for i in id_verifier_list]
  84. print('`uniq_id` names for records removed: ',
  85.       df.uniq_id[wrong_id])
  86.  
  87. # statistical exploratory analysis on numeric columns
  88. print('statistics on numeric columns',
  89.       df_clean.describe())
  90.  
  91. # Create a histogram of life_expectancy
  92. df_clean.number_available_in_stock.plot(kind='hist',
  93.                                         title='Available in stock',
  94.                                         bins=20)
  95.  
  96. # Add second subplot
  97. plt.axes([0.5, 0.45, 0.35, 0.35])
  98.  
  99. # zoom plot for number_available_in_stock > 35
  100. df_clean.number_available_in_stock.plot(kind='hist',
  101.                                         title='Available in stock > 35',
  102.                                         bins=20,
  103.                                         ylim=[0,100],
  104.                                         xlim=[35,80])
  105.  
  106. # histograms for number available in stock by type of stock
  107. df_clean['number_available_in_stock'].\
  108. hist(by=df_clean['type_available_in_stock'])
  109.  
  110. plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement