Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Packages attached by this script.
library(data.table)
library(ggplot2)
library(plyr)
library(ez)
library(corrgram)

# Read the raw survey responses and convert them to a data.table.
# NOTE(review): the path below is machine-specific; adjust it when running
# the script on another machine.
survey_path <- "/home/georgiy/MATLAB/help/survey.csv"
mental <- as.data.table(read.csv(
  survey_path,
  header = TRUE, sep = ",", dec = ".", fill = TRUE,
  stringsAsFactors = FALSE
))

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(mental)
}
# Goal: visualise how gender, age, nationality and a family history of mental
# illness affect the likelihood of having a mental illness among people
# working for tech companies.
#
# Data preparation: build a narrower data.table that holds only the survey
# columns evaluated below, under capitalised column names.
mental1 <- mental[, .(
  Age = Age,
  Gender = Gender,
  Country = Country,
  Family_history = family_history,
  Treatment = treatment,
  Tech_company = tech_company
)]
## Age ----
# Inspect the raw age distribution before any cleaning.
summary(mental1$Age)
barplot(table(mental1$Age), main = "Age distribution", xlab = "Age")

# The distribution contains implausible values. Ages below 16 or above 90 are
# treated as invalid, since nobody would be employed outside these bounds.
outlying_age <- subset(mental1, Age < 16 | Age > 90)
nrow(outlying_age)
outlying_age

# Mark every invalid age as missing in one explicit step (no intermediate
# sentinel value), then drop the rows without a valid age. From here on
# `mental` is the cleaned working table rather than the raw survey.
mental1[Age < 16 | Age > 90, Age := NA]
mental <- mental1[!is.na(Age)]

# Check the remaining maximum and minimum.
max(mental$Age, na.rm = TRUE)
min(mental$Age, na.rm = TRUE)

# Summary statistics for the plausible ages.
boxplot(
  mental$Age,
  col = "blue", main = "General age statistics", ylab = "Age"
)
summary(mental$Age)

# The boxplot still shows high outliers; get rid of them by keeping only ages
# no more than two standard deviations above the mean.
age_cutoff <- mean(mental$Age, na.rm = TRUE) +
  2 * sd(mental$Age, na.rm = TRUE)
mental <- mental[Age <= age_cutoff]
boxplot(
  mental$Age,
  col = "blue", main = "General age statistics", ylab = "Age"
)
## Gender ----
# Show which free-text answers occur and how often. table() is used because
# summary() on a character column reports only its length, class and mode.
table(mental$Gender, useNA = "ifany")

# Only the answers listed here receive a numeric code (2 = male, 1 = female).
# Any other answer looks up to NA and is dropped below. An explicit lookup
# gives the same codes as coercing strings with as.numeric(), without the
# "NAs introduced by coercion" warning.
gender_codes <- c(
  Male = 2, male = 2, M = 2, m = 2,
  Female = 1, female = 1, F = 1, f = 1
)
# as.character() guards against the column being a factor, which would
# otherwise index the lookup by integer level instead of by name.
mental$Gender <- unname(gender_codes[as.character(mental$Gender)])
mental <- mental[!is.na(Gender)]
## Country ----
# List the countries present in the data, most frequent first.
country_counts <- setorder(mental[, .N, by = Country], -N)
country_counts

# Keep only respondents from the five countries used in this analysis.
# NOTE(review): the original comments describe these as the five most frequent
# countries; confirm against `country_counts` when the data changes.
kept_countries <- c(
  "United States", "United Kingdom", "Canada", "Germany", "Ireland"
)
mental <- mental[Country %in% kept_countries]
## Family history, treatment and employer type ----
# Recode the "Yes"/"No" answers to real integers (1/0). Assigning 1 or 0 into
# part of a character column would leave the column as the strings "1"/"0".
# Any answer other than "Yes"/"No" becomes NA.
yes_no_codes <- c(No = 0L, Yes = 1L)

# Family history of mental illness.
mental$Family_history <- unname(
  yes_no_codes[as.character(mental$Family_history)]
)

# Whether the respondent has sought treatment.
mental$Treatment <- unname(yes_no_codes[as.character(mental$Treatment)])

# Tech company or not: keep only tech-company workers, then store the flag as
# the integer 1 for every remaining row.
mental <- mental[Tech_company %in% "Yes"]
mental[, Tech_company := rep(1L, .N)]
## Treatment rates by gender, family history and country ----

# Share of treated respondents per category.
#
# `treated` and `total` are plyr::count() results (columns `x` and `freq`).
# Counts are aligned by category value rather than by row position, so a
# category with no treated respondents yields a share of 0 instead of shifting
# or recycling the division. Returns a data.frame with columns `x` and `freq`,
# one row per category in `total`.
treated_share <- function(treated, total) {
  treated_n <- treated$freq[match(total$x, treated$x)]
  treated_n[is.na(treated_n)] <- 0
  data.frame(
    x = total$x,
    freq = treated_n / total$freq,
    stringsAsFactors = FALSE
  )
}

# Overall counts per category (family history and treatment are printed).
famcount <- count(mental$Family_history)
famcount
count(mental$Treatment)
gencount <- count(mental$Gender)
countryCount <- count(mental$Country)

# Row masks for respondents who did / did not seek treatment. %in% never
# returns NA, so rows with a missing Treatment value fall into neither group.
is_treated <- mental$Treatment %in% 1
is_untreated <- mental$Treatment %in% 0

# Gender: share treated, plus raw counts of the untreated.
treatedGender <- treated_share(count(mental$Gender[is_treated]), gencount)
nonTreatedGender <- count(mental$Gender[is_untreated])

# Family history: share treated, plus raw counts of the untreated.
treatedFam <- treated_share(
  count(mental$Family_history[is_treated]),
  famcount
)
nonTreatedFam <- count(mental$Family_history[is_untreated])

# Country: share treated.
treatedCountry <- treated_share(
  count(mental$Country[is_treated]),
  countryCount
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement