# Untitled

library(data.table)
library(ggplot2)
library(plyr)
library(ez)
library(corrgram)

# Goal: visualise how gender, age, nationality and family history of mental
# illness affect the likelihood of having a mental illness among people
# working for tech companies.

# Load the raw survey responses.
# NOTE(review): the path below is machine-specific - adjust it before running
# this script anywhere else.
mental <- as.data.table(read.csv(
  "/home/georgiy/MATLAB/help/survey.csv",
  header = TRUE, sep = ",", dec = ".", fill = TRUE, stringsAsFactors = FALSE
))
# View() opens a data viewer, so only call it when someone is there to look
if (interactive()) {
  View(mental)
}

# Clean and prepare data
# Create a new data.table holding only the parameters we plan to evaluate.
# Selecting inside `[` fails loudly if an expected column is missing.
mental1 <- mental[, .(
  Age,
  Gender,
  Country,
  Family_history = family_history,
  Treatment = treatment,
  Tech_company = tech_company
)]

## Let's visualise each parameter
## Age
summary(mental1$Age)
barplot(table(mental1$Age), main = "Age distribution", xlab = "Age")
# The bar plot clearly shows outlying values.
# Ages below 16 or above 90 are treated as invalid, as nobody would employ
# people outside these boundaries.
min_age <- 16
max_age <- 90
outlying_age <- mental1[Age < min_age | Age > max_age]
nrow(outlying_age)
outlying_age
# Mark every out-of-range age as missing in one step (no sentinel values),
# then keep only the rows with a usable age
mental1[Age < min_age | Age > max_age, Age := NA]
mental <- mental1[!is.na(Age)]
# Check maximum and minimum values
max(mental$Age, na.rm = TRUE)
min(mental$Age, na.rm = TRUE)
# Statistics
boxplot(mental$Age, col = "blue", main = "General age statistics", ylab = "Age")
summary(mental$Age)
# The boxplot still shows high outliers: get rid of ages more than two
# standard deviations above the mean
age_cutoff <- mean(mental$Age, na.rm = TRUE) + 2 * sd(mental$Age, na.rm = TRUE)
mental <- mental[Age <= age_cutoff]
boxplot(mental$Age, col = "blue", main = "General age statistics", ylab = "Age")

## Gender
# Keep only plain male/female answers and encode them numerically
# (2 = male, 1 = female). Any other free-text answer becomes NA and the row
# is dropped.
summary(mental$Gender)

gender_codes <- c(
  "Male" = 2, "male" = 2, "M" = 2, "m" = 2,
  "Female" = 1, "female" = 1, "F" = 1, "f" = 1
)
# Look labels up by name: unmatched labels yield NA directly instead of
# relying on a failed (and warning-raising) string-to-number coercion
mental[, Gender := unname(gender_codes[as.character(Gender)])]
mental <- mental[!is.na(Gender)]


## Country
# See which countries are represented, most frequent first
a <- setorder(mental[, .N, by = Country], -N)
a
# Restrict the analysis to the five countries listed below.
# NOTE(review): `entries` lists only three countries while the filter keeps
# five - confirm which set is intended.
entries <- c('United States', 'United Kingdom', 'Canada')
countryInd <- mental$Country %in% c(entries, 'Germany', 'Ireland')
mental <- mental[countryInd]


# Family history: recode the 'Yes'/'No' answers as "1"/"0"
# (the column stays character)
mental[Family_history == 'Yes', Family_history := "1"]
mental[Family_history == 'No', Family_history := "0"]

# Treatment: same recoding as for family history
mental[Treatment == 'Yes', Treatment := "1"]
mental[Treatment == 'No', Treatment := "0"]

# Tech company or not: blank out the 'No' answers, flag 'Yes' as "1",
# then keep only the tech company workers
mental[Tech_company == 'No', Tech_company := NA]
mental[Tech_company == 'Yes', Tech_company := "1"]
mental <- mental[!is.na(Tech_company)]

# Respondent counts per category. count() on a vector returns a data frame
# with the category in `x` and the number of rows in `freq`.
famcount <- count(mental$Family_history)
famcount
count(mental$Treatment)
gencount <- count(mental$Gender)

# Treated / untreated respondents by gender
treatedGender <- count(mental[Treatment %in% 1, Gender])
nonTreatedGender <- count(mental[Treatment %in% 0, Gender])

# Turn treated counts into the share of all respondents in the same category.
# Totals are looked up by category value rather than by row position, so the
# shares stay correct even if some category has no treated respondents.
treatedGender$freq <- treatedGender$freq /
  gencount$freq[match(treatedGender$x, gencount$x)]

# Treated / untreated respondents by family history
treatedFam <- count(mental[Treatment %in% 1, Family_history])
nonTreatedFam <- count(mental[Treatment %in% 0, Family_history])

treatedFam$freq <- treatedFam$freq /
  famcount$freq[match(treatedFam$x, famcount$x)]

# Treated respondents by country, as a share of each country's respondents
treatedCountry <- count(mental[Treatment %in% 1, Country])
countryCount <- count(mental$Country)
treatedCountry$freq <- treatedCountry$freq /
  countryCount$freq[match(treatedCountry$x, countryCount$x)]