SHARE
TWEET

Untitled

a guest Oct 10th, 2019 70 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import pandas as pd
  2. import numpy as np
  3. import matplotlib
  4. import matplotlib.pyplot as plt
  5. import seaborn as sns
  6. import missingno
  7. import warnings
  8. warnings.filterwarnings("ignore")
  9. %matplotlib inline
  10.  
  11.        
  12. def time_series_plot(df):
  13.     """Given dataframe, generate times series plot of numeric data by daily, monthly and yearly frequency"""
  14.     print("\nTo check time series of numeric data  by daily, monthly and yearly frequency")
  15.     if len(df.select_dtypes(include='datetime64').columns)>0:
  16.         for col in df.select_dtypes(include='datetime64').columns:
  17.             for p in ['D', 'M', 'Y']:
  18.                 if p=='D':
  19.                     print("Plotting daily data")
  20.                 elif p=='M':
  21.                     print("Plotting monthly data")
  22.                 else:
  23.                     print("Plotting yearly data")
  24.                 for col_num in df.select_dtypes(include=np.number).columns:
  25.                     __ = df.copy()
  26.                     __ = __.set_index(col)
  27.                     __T = __.resample(p).sum()
  28.                     ax = __T[[col_num]].plot()
  29.                     ax.set_ylim(bottom=0)
  30.                     ax.get_yaxis().set_major_formatter(
  31.                     matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
  32.                     plt.show()
  33.  
  34.                    
  35. def numeric_eda(df, hue=None):
  36.     """Given dataframe, generate EDA of numeric data"""
  37.     print("\nTo check: \nDistribution of numeric data")
  38.     display(df.describe().T)
  39.     columns = df.select_dtypes(include=np.number).columns
  40.     figure = plt.figure(figsize=(20, 10))
  41.     figure.add_subplot(1, len(columns), 1)
  42.     for index, col in enumerate(columns):
  43.         if index > 0:
  44.             figure.add_subplot(1, len(columns), index + 1)
  45.         sns.boxplot(y=col, data=df, boxprops={'facecolor': 'None'})
  46.     figure.tight_layout()
  47.     plt.show()
  48.    
  49.     if len(df.select_dtypes(include='category').columns) > 0:
  50.         for col_num in df.select_dtypes(include=np.number).columns:
  51.             for col in df.select_dtypes(include='category').columns:
  52.                 fig = sns.catplot(x=col, y=col_num, kind='violin', data=df, height=5, aspect=2)
  53.                 fig.set_xticklabels(rotation=90)
  54.                 plt.show()
  55.    
  56.     # Plot the pairwise joint distributions
  57.     print("\nTo check pairwise joint distribution of numeric data")
  58.     if hue==None:
  59.         sns.pairplot(df.select_dtypes(include=np.number))
  60.     else:
  61.         sns.pairplot(df.select_dtypes(include=np.number).join(df[[hue]]), hue=hue)
  62.     plt.show()
  63.  
  64.  
  65. def top5(df):
  66.     """Given dataframe, generate top 5 unique values for non-numeric data"""
  67.     columns = df.select_dtypes(include=['object', 'category']).columns
  68.     for col in columns:
  69.         print("Top 5 unique values of " + col)
  70.         print(df[col].value_counts().reset_index().rename(columns={"index": col, col: "Count"})[
  71.               :min(5, len(df[col].value_counts()))])
  72.         print(" ")
  73.    
  74.    
  75. def categorical_eda(df, hue=None):
  76.     """Given dataframe, generate EDA of categorical data"""
  77.     print("\nTo check: \nUnique count of non-numeric data\n")
  78.     print(df.select_dtypes(include=['object', 'category']).nunique())
  79.     top5(df)
  80.     # Plot count distribution of categorical data
  81.     for col in df.select_dtypes(include='category').columns:
  82.         fig = sns.catplot(x=col, kind="count", data=df, hue=hue)
  83.         fig.set_xticklabels(rotation=90)
  84.         plt.show()
  85.    
  86.  
  87. def eda(df):
  88.     """Given dataframe, generate exploratory data analysis"""
  89.     # check that input is pandas dataframe
  90.     if type(df) != pd.core.frame.DataFrame:
  91.         raise TypeError("Only pandas dataframe is allowed as input")
  92.        
  93.     # replace field that's entirely space (or empty) with NaN
  94.     df = df.replace(r'^\s*$', np.nan, regex=True)
  95.  
  96.     print("Preview of data:")
  97.     display(df.head(3))
  98.  
  99.     print("\nTo check: \n (1) Total number of entries \n (2) Column types \n (3) Any null values\n")
  100.     print(df.info())
  101.  
  102.     # generate preview of entries with null values
  103.     if len(df[df.isnull().any(axis=1)] != 0):
  104.         print("\nPreview of data with null values:")
  105.         display(df[df.isnull().any(axis=1)].head(3))
  106.         missingno.matrix(df)
  107.         plt.show()
  108.  
  109.     # generate count statistics of duplicate entries
  110.     if len(df[df.duplicated()]) > 0:
  111.         print("\n***Number of duplicated entries: ", len(df[df.duplicated()]))
  112.         display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head())
  113.     else:
  114.         print("\nNo duplicated entries found")
  115.  
  116.     # EDA of categorical data
  117.     categorical_eda(df)
  118.    
  119.     # EDA of numeric data
  120.     numeric_eda(df)
  121.        
  122.     # Plot time series plot of numeric data
  123.     time_series_plot(df)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top