Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- a=c(1,2,3,NA,5,6,NA)
- positions=which(is.na(a))
- a=[1,2,3,np.nan,5,6,np.nan]
- positions=pd.isnull(a)
- In [307]:
- a=[1,2,3,np.nan,5,6,np.nan]
- np.nonzero(pd.isnull(a))
- Out[307]:
- (array([3, 6], dtype=int64),)
- indexes = [index for index,element in enumerate(a) if np.isnan(element)]
def Kickstarter_Example_94():
    """Print a walkthrough of handling missing values in a pandas DataFrame.

    Builds a small six-column frame that contains NaN entries and prints the
    result of each technique in turn: dropping rows/columns with ``dropna``,
    filling with ``fillna`` (a constant, a column mean, a per-group mean) and
    selecting rows with ``notnull`` masks.

    Takes no arguments and returns ``None``; all output goes to stdout.
    """
    import warnings

    print()
    print(format('How to deal with missing values in a Pandas DataFrame', '*^82'))

    # Silence library warnings for the duration of the demo only; the
    # caller's warning filters are restored when the block exits.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        # load libraries (imported here so the example stays self-contained)
        import pandas as pd
        import numpy as np

        # Create dataframe with missing values
        raw_data = {
            'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
            'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
            'age': [42, np.nan, 36, 24, 73],
            'sex': ['m', np.nan, 'f', 'm', 'f'],
            'preTestScore': [4, np.nan, np.nan, 2, 3],
            'postTestScore': [25, np.nan, np.nan, 62, 70],
        }
        df = pd.DataFrame(
            raw_data,
            columns=['first_name', 'last_name', 'age', 'sex',
                     'preTestScore', 'postTestScore'],
        )
        print(); print(df)

        # Drop every row that has at least one missing value
        df_no_missing = df.dropna()
        print(); print(df_no_missing)

        # Drop rows where all cells in that row are NA
        df_cleaned = df.dropna(how='all')
        print(); print(df_cleaned)

        # Create a new column full of missing values
        df['location'] = np.nan
        print(); print(df)

        # Drop columns that contain only missing values
        print(); print(df.dropna(axis=1, how='all'))

        # Drop rows that contain fewer than five non-missing observations
        # This is really mostly useful for time series
        print(); print(df.dropna(thresh=5))

        # Fill in missing data with zeros
        print(); print(df.fillna(0))

        # Fill in missing preTestScore with the mean value of preTestScore.
        # The filled column is assigned back to df: calling
        # fillna(inplace=True) on df["preTestScore"] acts on an intermediate
        # Series and does not update df under pandas Copy-on-Write.
        df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
        print(); print(df)

        # Fill in missing postTestScore with each sex's mean postTestScore.
        # Rows whose sex is itself missing belong to no group and stay NaN.
        mean_post_by_sex = df.groupby("sex")["postTestScore"].transform("mean")
        df["postTestScore"] = df["postTestScore"].fillna(mean_post_by_sex)
        print(); print(df)

        # Select the rows of df where age is not NaN and sex is not NaN
        print(); print(df[df['age'].notnull() & df['sex'].notnull()])
        print(); print(df[df['age'].notnull() & df['sex'].notnull()].fillna(0))


Kickstarter_Example_94()
Add Comment
Please, Sign In to add comment