# Untitled -- exploratory charts for the census income dataset

import pandas

# Load the census survey and relabel the '?' placeholder as 'unknown' in the
# categorical columns cleaned below.
df = pandas.read_csv("census_income_dataset.csv")
# Debugging aid: print(df.isnull().sum()) lists the missing-value count per column.
# Assign each cleaned column back to the frame. Calling
# replace(..., inplace=True) on df[column] acts on an intermediate Series:
# pandas 2.x emits a FutureWarning for it, and with copy-on-write enabled
# (the default from pandas 3.0) df itself is left unchanged.
for column in ('occupation', 'workclass', 'native_country'):
    df[column] = df[column].replace({'?': 'unknown'})
import matplotlib.pyplot as plot
import numpy as np

# Pie chart of the share of respondents in each 'sex' category.
sex_counts = df.groupby('sex').size()
# explode carries one offset per wedge, so this assumes exactly two
# categories; the second wedge is pulled away from the centre.
sex_counts.plot.pie(autopct='%1.1f%%', explode=(0, 0.1))
plot.title('Gender distribution')
plot.show()

# Histogram of respondent ages, drawn on its own figure.
figure, ax = plot.subplots()
ax.hist(
    df['age'],
    color='red',
    alpha=0.50,  # half-transparent bars
    label='Age frequency of respondents',
)
# Set the title and both axis labels in a single call.
ax.set(
    title='Histogram showing the frequency  of ages',
    xlabel='Ages',
    ylabel='Frequency',
)
plot.show()


# Pie chart of how respondents are spread across occupations.
occupation_counts = df.groupby('occupation').size()
occupation_counts.plot.pie(autopct='%1.1f%%')
plot.title('Occupations')
plot.show()

# Pie chart of how respondents are spread across the 'race' categories.
df.groupby('race').size().plot(kind = 'pie', autopct = '%1.1f%%')
# The chart name belongs in title(); show() only takes the `block` flag, so
# passing 'Race' to it never produced a title.
plot.title('Race')
plot.show()


# Bar chart of the mean weekly hours worked within each occupation.
hours_by_occupation = df.groupby(['occupation'])['hours_per_week']
z = hours_by_occupation.mean()
z.plot.bar(color='purple')
plot.title('Hours worked per week by different occupations')
plot.xlabel('Occupation')
plot.ylabel('Hours per week')
plot.show()

# Mean weekly hours for every (occupation, sex) pair, reshaped so each
# occupation is a row and each sex is a column.
mean_hours = df.groupby(['occupation', 'sex'])['hours_per_week'].mean()
hours_by_sex = mean_hours.unstack()
# stacked=True piles the per-sex bars on top of each other; stacked=False
# would draw them side by side instead.
# NOTE(review): the stacked bar height is the sum of the per-sex means, not
# an overall mean -- confirm this is the intended reading.
hours_by_sex.plot.bar(stacked=True)
plot.title('Occupation and sex of respondents by hours worked per week')
plot.xlabel('Occupation')
plot.ylabel('Hours per week')
plot.show()

#df.groupby('native_country').size().plot(kind = 'pie', autopct = '%1.1f%%')

#plot.title('Countries')
#plot.show()


#df.groupby(['native_country','sex'])['hours_per_week'].mean().unstack().plot(kind = 'bar', stacked = True) #if we put false, it would be unstacked

# Mean weekly hours for every (education, sex) pair, with one row per
# education level and one column per sex.
education_means = df.groupby(['education', 'sex'])['hours_per_week'].mean()
education_hours = education_means.unstack()
# stacked=True piles the per-sex bars on top of each other; stacked=False
# would draw them side by side instead.
education_hours.plot.bar(stacked=True)
plot.title('Education level of people of different genders by hours per week')
plot.xlabel('Education')
plot.ylabel('hours per week')
plot.show()

import seaborn
# Correlation heatmap over the numeric columns of a subset of the survey.
df_x = df[['age','education','marital_status','occupation','sex','hours_per_week','income_level']]
figure, ax = plot.subplots()
# Keep only numeric/bool columns before correlating: since pandas 2.0,
# DataFrame.corr() no longer drops text columns silently and raises when it
# meets one (e.g. 'occupation', which holds strings such as 'unknown').
numeric_columns = df_x.select_dtypes(include=['number', 'bool'])
# annot=True writes each correlation coefficient inside its cell; ax=ax draws
# on the axes created above.
seaborn.heatmap(numeric_columns.corr(), cmap='Reds', annot=True, ax=ax)
plot.show()