Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Exploratory plots for the census income dataset.
#
# Reads "census_income_dataset.csv", relabels the '?' placeholder in three
# categorical columns as 'unknown', then displays a sequence of charts
# (each plot.show() call presents one figure before the next is built).
import matplotlib.pyplot as plot
import numpy as np
import pandas
import seaborn

df = pandas.read_csv("census_income_dataset.csv")
# print(df.isnull().sum())

# Assign the result back instead of calling replace(inplace=True) on a column
# selection: with pandas Copy-on-Write the in-place form only modifies a
# temporary object and leaves df unchanged.
for column in ('occupation', 'workclass', 'native_country'):
    df[column] = df[column].replace('?', 'unknown')

# --- Gender distribution (pie) ---
# explode needs one entry per wedge, so this assumes 'sex' has exactly two
# distinct values; the second wedge is pulled out slightly.
df.groupby('sex').size().plot(kind='pie', autopct='%1.1f%%', explode=(0, 0.1))
plot.title('Gender distribution')
plot.show()

# --- Age histogram ---
figure, ax = plot.subplots()
# alpha makes the bars a bit transparent, so overlapping bars stay visible.
ax.hist(df['age'], color='red', label='Age frequency of respondents', alpha=0.50)
ax.set_title('Histogram showing the frequency of ages')
ax.set_xlabel('Ages')
ax.set_ylabel('Frequency')
plot.show()

# --- Occupation share (pie) ---
df.groupby('occupation').size().plot(kind='pie', autopct='%1.1f%%')
plot.title('Occupations')
plot.show()

# --- Race share (pie) ---
df.groupby('race').size().plot(kind='pie', autopct='%1.1f%%')
# The title must be set with plot.title(); plot.show() takes no title
# argument (its only parameter is the keyword-only `block`).
plot.title('Race')
plot.show()

# --- Mean weekly hours per occupation (bar) ---
mean_hours_by_occupation = df.groupby(['occupation'])['hours_per_week'].mean()
mean_hours_by_occupation.plot(kind='bar', color='purple')
plot.title('Hours worked per week by different occupations')
plot.xlabel('Occupation')
plot.ylabel('Hours per week')
plot.show()

# --- Mean weekly hours per occupation, split by sex (stacked bar) ---
# unstack() turns the 'sex' index level into columns, one bar segment each;
# stacked=False would draw the segments side by side instead.
mean_hours_by_occupation_sex = (
    df.groupby(['occupation', 'sex'])['hours_per_week'].mean().unstack()
)
mean_hours_by_occupation_sex.plot(kind='bar', stacked=True)
plot.title('Occupation and sex of respondents by hours worked per week')
plot.xlabel('Occupation')
plot.ylabel('Hours per week')
plot.show()

# Disabled by the original author: native_country breakdowns.
# df.groupby('native_country').size().plot(kind = 'pie', autopct = '%1.1f%%')
# plot.title('Countries')
# plot.show()
# df.groupby(['native_country','sex'])['hours_per_week'].mean().unstack().plot(kind = 'bar', stacked = True)

# --- Mean weekly hours per education level, split by sex (stacked bar) ---
mean_hours_by_education_sex = (
    df.groupby(['education', 'sex'])['hours_per_week'].mean().unstack()
)
mean_hours_by_education_sex.plot(kind='bar', stacked=True)
plot.title('Education level of people of different genders by hours per week')
plot.xlabel('Education')
plot.ylabel('hours per week')
plot.show()

# --- Correlation heatmap ---
df_x = df[['age', 'education', 'marital_status', 'occupation', 'sex',
           'hours_per_week', 'income_level']]
# Correlation is only defined for numeric data. At least 'occupation' holds
# text labels, and pandas >= 2.0 raises on such columns in corr() (older
# versions silently dropped them), so restrict to numeric columns explicitly.
numeric_columns = df_x.select_dtypes(include='number')
figure, ax = plot.subplots()
# annot=True writes each correlation value inside its heatmap cell.
seaborn.heatmap(numeric_columns.corr(), cmap='Reds', annot=True, ax=ax)
plot.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement