Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Exploratory cleaning of the Amazon e-commerce sample CSV.
#
# The cleaning steps live in small pandas-only helpers so they can be imported
# and reused; the notebook-style walkthrough runs only when this file is
# executed directly (or pasted into a notebook cell).

# importing data analysis packages
import random as rnd

import numpy as np
import pandas as pd


def drop_unnamed_columns(frame):
    """Return a copy of *frame* without its "unnamed" columns.

    A column is dropped when its label contains ``unnamed`` in any letter
    case. Labels that are not strings are kept. *frame* is not modified.
    """
    is_unnamed = frame.columns.str.contains('unnamed', case=False, na=False)
    return frame.drop(columns=frame.columns[is_unnamed])


def impute_numeric_means(frame):
    """Return a copy of *frame* with NaNs in numeric columns set to the column mean.

    Non-numeric columns are left untouched, so a frame holding text columns
    does not break the imputation. The result is still a DataFrame, which
    keeps the later ``isnull()`` checks working.
    """
    column_means = frame.mean(numeric_only=True)
    # fillna with a Series fills each column from the entry with the same label.
    return frame.fillna(column_means)


def columns_with_missing(frame):
    """Return the labels of the columns of *frame* that contain a missing value."""
    missing_counts = frame.isnull().sum()
    return missing_counts[missing_counts > 0].index.tolist()


def interpolate_columns(frame, columns):
    """Return a copy of *frame* with the numeric *columns* interpolated.

    Each listed numeric column is passed through ``Series.interpolate()``
    with its default settings. Listed columns that are not numeric are
    skipped, because interpolation is only meaningful for numbers.
    """
    result = frame.copy()
    for name in columns:
        if pd.api.types.is_numeric_dtype(result[name]):
            result[name] = result[name].interpolate()
    return result


if __name__ == "__main__":
    # importing data visualization packages
    # Imported here so the helpers above can be used without the plotting stack.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Plain-Python spelling of the notebook magic `%matplotlib inline`;
    # outside IPython `get_ipython` does not exist and the step is skipped.
    try:
        get_ipython().run_line_magic("matplotlib", "inline")
    except NameError:
        pass

    # read file
    originaldata = pd.read_csv("./amazon_co-ecommerce_sample.csv")
    print(originaldata.shape)
    print(originaldata)

    # Preview only: what the raw data looks like without all-NaN columns and
    # without "Unnamed" columns. The raw frame itself is left as loaded.
    print(originaldata.dropna(how='all', axis='columns'))
    print(originaldata.columns)
    print(originaldata.columns.str.match('Unnamed'))
    print(originaldata.loc[:, ~originaldata.columns.str.match('Unnamed')])
    print(originaldata.isnull().sum())

    # getting rid of unnamed columns with Nan
    # Reload with the first CSV column as the index, then keep the cleaned
    # result (dropna returns a new frame, so it has to be assigned).
    originaldata = pd.read_csv('amazon_co-ecommerce_sample.csv', index_col=0)
    originaldata = drop_unnamed_columns(originaldata)
    originaldata = originaldata.dropna(how='all', axis='columns')

    # imputation and interpolation
    print(originaldata.isnull().sum())
    originaldata = impute_numeric_means(originaldata)

    nullFeatures = columns_with_missing(originaldata)
    print(nullFeatures)
    # The first two columns with missing values are skipped, as before.
    # NOTE(review): the reason for skipping them is not visible here - confirm.
    originaldata = interpolate_columns(originaldata, nullFeatures[2:])
    print(originaldata.isnull().sum())

    # NOTE(review): `pp` is not imported anywhere in this chunk (presumably a
    # profiling package imported elsewhere) - confirm before running.
    profile = pp.ProfileReport(originaldata)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement